diff options
Diffstat (limited to 'arch')
150 files changed, 2607 insertions, 1993 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 5a1692392a4d..678a80713b21 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -648,8 +648,7 @@ config ARCH_SUPPORTS_LTO_CLANG_THIN config HAS_LTO_CLANG def_bool y - # Clang >= 11: https://github.com/ClangBuiltLinux/linux/issues/510 - depends on CC_IS_CLANG && CLANG_VERSION >= 110000 && LD_IS_LLD && AS_IS_LLVM + depends on CC_IS_CLANG && LD_IS_LLD && AS_IS_LLVM depends on $(success,$(NM) --help | head -n 1 | grep -qi llvm) depends on $(success,$(AR) --help | head -n 1 | grep -qi llvm) depends on ARCH_SUPPORTS_LTO_CLANG @@ -998,6 +997,10 @@ config PAGE_SIZE_LESS_THAN_64KB depends on !PAGE_SIZE_64KB depends on !PARISC_PAGE_SIZE_64KB depends on !PPC_64K_PAGES + depends on PAGE_SIZE_LESS_THAN_256KB + +config PAGE_SIZE_LESS_THAN_256KB + def_bool y depends on !PPC_256K_PAGES depends on !PAGE_SIZE_256KB diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c index ce3077946e1d..fb3025396ac9 100644 --- a/arch/alpha/kernel/rtc.c +++ b/arch/alpha/kernel/rtc.c @@ -80,7 +80,12 @@ init_rtc_epoch(void) static int alpha_rtc_read_time(struct device *dev, struct rtc_time *tm) { - mc146818_get_time(tm); + int ret = mc146818_get_time(tm); + + if (ret < 0) { + dev_err_ratelimited(dev, "unable to read current time\n"); + return ret; + } /* Adjust for non-default epochs. It's easier to depend on the generic __get_rtc_time and adjust the epoch here than create diff --git a/arch/arc/Makefile b/arch/arc/Makefile index f252e7b924e9..efc54f3e35e0 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -14,10 +14,10 @@ cflags-y += -fno-common -pipe -fno-builtin -mmedium-calls -D__linux__ tune-mcpu-def-$(CONFIG_ISA_ARCOMPACT) := -mcpu=arc700 tune-mcpu-def-$(CONFIG_ISA_ARCV2) := -mcpu=hs38 -ifeq ($(CONFIG_ARC_TUNE_MCPU),"") +ifeq ($(CONFIG_ARC_TUNE_MCPU),) cflags-y += $(tune-mcpu-def-y) else -tune-mcpu := $(shell echo $(CONFIG_ARC_TUNE_MCPU)) +tune-mcpu := $(CONFIG_ARC_TUNE_MCPU) ifneq ($(call cc-option,$(tune-mcpu)),) cflags-y += $(tune-mcpu) else diff --git a/arch/arc/boot/dts/Makefile b/arch/arc/boot/dts/Makefile index 8483a86c743d..4237aa5de3a3 100644 --- a/arch/arc/boot/dts/Makefile +++ b/arch/arc/boot/dts/Makefile @@ -2,8 +2,8 @@ # Built-in dtb builtindtb-y := nsim_700 -ifneq ($(CONFIG_ARC_BUILTIN_DTB_NAME),"") - builtindtb-y := $(patsubst "%",%,$(CONFIG_ARC_BUILTIN_DTB_NAME)) +ifneq ($(CONFIG_ARC_BUILTIN_DTB_NAME),) + builtindtb-y := $(CONFIG_ARC_BUILTIN_DTB_NAME) endif obj-y += $(builtindtb-y).dtb.o diff --git a/arch/arc/include/asm/irqflags-compact.h b/arch/arc/include/asm/irqflags-compact.h index 863d63ad18d6..0d63e568d64c 100644 --- a/arch/arc/include/asm/irqflags-compact.h +++ b/arch/arc/include/asm/irqflags-compact.h @@ -50,8 +50,12 @@ * are redone after IRQs are re-enabled (and gcc doesn't reuse stale register) * * Noted at the time of Abilis Timer List corruption - * Orig Bug + Rejected solution : https://lkml.org/lkml/2013/3/29/67 - * Reasoning : https://lkml.org/lkml/2013/4/8/15 + * + * Orig Bug + Rejected solution: + * https://lore.kernel.org/lkml/1364553218-31255-1-git-send-email-vgupta@synopsys.com + * + * Reasoning: + * https://lore.kernel.org/lkml/CA+55aFyFWjpSVQM6M266tKrG_ZXJzZ-nYejpmXYQXbrr42mGPQ@mail.gmail.com * ******************************************************************/ diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index e1971d34ef30..4c919c0f4b30 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -63,166 +63,4 @@ struct arc_reg_cc_build { #define PERF_COUNT_ARC_HW_MAX (PERF_COUNT_HW_MAX + 8) -/* - * Some ARC pct quirks: - * - * PERF_COUNT_HW_STALLED_CYCLES_BACKEND - * PERF_COUNT_HW_STALLED_CYCLES_FRONTEND - * The ARC 700 can either measure stalls per pipeline stage, or all stalls - * combined; for now we assign all stalls to STALLED_CYCLES_BACKEND - * and all pipeline flushes (e.g. caused by mispredicts, etc.) to - * STALLED_CYCLES_FRONTEND. - * - * We could start multiple performance counters and combine everything - * afterwards, but that makes it complicated. - * - * Note that I$ cache misses aren't counted by either of the two! - */ - -/* - * ARC PCT has hardware conditions with fixed "names" but variable "indexes" - * (based on a specific RTL build) - * Below is the static map between perf generic/arc specific event_id and - * h/w condition names. - * At the time of probe, we loop thru each index and find it's name to - * complete the mapping of perf event_id to h/w index as latter is needed - * to program the counter really - */ -static const char * const arc_pmu_ev_hw_map[] = { - /* count cycles */ - [PERF_COUNT_HW_CPU_CYCLES] = "crun", - [PERF_COUNT_HW_REF_CPU_CYCLES] = "crun", - [PERF_COUNT_HW_BUS_CYCLES] = "crun", - - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = "bflush", - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = "bstall", - - /* counts condition */ - [PERF_COUNT_HW_INSTRUCTIONS] = "iall", - /* All jump instructions that are taken */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmptak", -#ifdef CONFIG_ISA_ARCV2 - [PERF_COUNT_HW_BRANCH_MISSES] = "bpmp", -#else - [PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */ - [PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */ -#endif - [PERF_COUNT_ARC_LDC] = "imemrdc", /* Instr: mem read cached */ - [PERF_COUNT_ARC_STC] = "imemwrc", /* Instr: mem write cached */ - - [PERF_COUNT_ARC_DCLM] = "dclm", /* D-cache Load Miss */ - [PERF_COUNT_ARC_DCSM] = "dcsm", /* D-cache Store Miss */ - [PERF_COUNT_ARC_ICM] = "icm", /* I-cache Miss */ - [PERF_COUNT_ARC_EDTLB] = "edtlb", /* D-TLB Miss */ - [PERF_COUNT_ARC_EITLB] = "eitlb", /* I-TLB Miss */ - - [PERF_COUNT_HW_CACHE_REFERENCES] = "imemrdc", /* Instr: mem read cached */ - [PERF_COUNT_HW_CACHE_MISSES] = "dclm", /* D-cache Load Miss */ -}; - -#define C(_x) PERF_COUNT_HW_CACHE_##_x -#define CACHE_OP_UNSUPPORTED 0xffff - -static const unsigned int arc_pmu_cache_map[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { - [C(L1D)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = PERF_COUNT_ARC_LDC, - [C(RESULT_MISS)] = PERF_COUNT_ARC_DCLM, - }, - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = PERF_COUNT_ARC_STC, - [C(RESULT_MISS)] = PERF_COUNT_ARC_DCSM, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - }, - [C(L1I)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = PERF_COUNT_HW_INSTRUCTIONS, - [C(RESULT_MISS)] = PERF_COUNT_ARC_ICM, - }, - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - }, - [C(LL)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - }, - [C(DTLB)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = PERF_COUNT_ARC_LDC, - [C(RESULT_MISS)] = PERF_COUNT_ARC_EDTLB, - }, - /* DTLB LD/ST Miss not segregated by h/w*/ - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - }, - [C(ITLB)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = PERF_COUNT_ARC_EITLB, - }, - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - }, - [C(BPU)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, - [C(RESULT_MISS)] = PERF_COUNT_HW_BRANCH_MISSES, - }, - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - }, - [C(NODE)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, - [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, - }, - }, -}; - #endif /* __ASM_PERF_EVENT_H */ diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index c0942c24d401..d36863e34bfc 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -99,8 +99,8 @@ static inline __attribute_const__ struct thread_info *current_thread_info(void) /* * _TIF_ALLWORK_MASK includes SYSCALL_TRACE, but we don't need it. - * SYSCALL_TRACE is anyway seperately/unconditionally tested right after a - * syscall, so all that reamins to be tested is _TIF_WORK_MASK + * SYSCALL_TRACE is anyway separately/unconditionally tested right after a + * syscall, so all that remains to be tested is _TIF_WORK_MASK */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 145722f80c9b..adff957962da 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -17,6 +17,168 @@ /* HW holds 8 symbols + one for null terminator */ #define ARCPMU_EVENT_NAME_LEN 9 +/* + * Some ARC pct quirks: + * + * PERF_COUNT_HW_STALLED_CYCLES_BACKEND + * PERF_COUNT_HW_STALLED_CYCLES_FRONTEND + * The ARC 700 can either measure stalls per pipeline stage, or all stalls + * combined; for now we assign all stalls to STALLED_CYCLES_BACKEND + * and all pipeline flushes (e.g. caused by mispredicts, etc.) to + * STALLED_CYCLES_FRONTEND. + * + * We could start multiple performance counters and combine everything + * afterwards, but that makes it complicated. + * + * Note that I$ cache misses aren't counted by either of the two! + */ + +/* + * ARC PCT has hardware conditions with fixed "names" but variable "indexes" + * (based on a specific RTL build) + * Below is the static map between perf generic/arc specific event_id and + * h/w condition names. + * At the time of probe, we loop thru each index and find it's name to + * complete the mapping of perf event_id to h/w index as latter is needed + * to program the counter really + */ +static const char * const arc_pmu_ev_hw_map[] = { + /* count cycles */ + [PERF_COUNT_HW_CPU_CYCLES] = "crun", + [PERF_COUNT_HW_REF_CPU_CYCLES] = "crun", + [PERF_COUNT_HW_BUS_CYCLES] = "crun", + + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = "bflush", + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = "bstall", + + /* counts condition */ + [PERF_COUNT_HW_INSTRUCTIONS] = "iall", + /* All jump instructions that are taken */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmptak", +#ifdef CONFIG_ISA_ARCV2 + [PERF_COUNT_HW_BRANCH_MISSES] = "bpmp", +#else + [PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */ + [PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */ +#endif + [PERF_COUNT_ARC_LDC] = "imemrdc", /* Instr: mem read cached */ + [PERF_COUNT_ARC_STC] = "imemwrc", /* Instr: mem write cached */ + + [PERF_COUNT_ARC_DCLM] = "dclm", /* D-cache Load Miss */ + [PERF_COUNT_ARC_DCSM] = "dcsm", /* D-cache Store Miss */ + [PERF_COUNT_ARC_ICM] = "icm", /* I-cache Miss */ + [PERF_COUNT_ARC_EDTLB] = "edtlb", /* D-TLB Miss */ + [PERF_COUNT_ARC_EITLB] = "eitlb", /* I-TLB Miss */ + + [PERF_COUNT_HW_CACHE_REFERENCES] = "imemrdc", /* Instr: mem read cached */ + [PERF_COUNT_HW_CACHE_MISSES] = "dclm", /* D-cache Load Miss */ +}; + +#define C(_x) PERF_COUNT_HW_CACHE_##_x +#define CACHE_OP_UNSUPPORTED 0xffff + +static const unsigned int arc_pmu_cache_map[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { + [C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = PERF_COUNT_ARC_LDC, + [C(RESULT_MISS)] = PERF_COUNT_ARC_DCLM, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = PERF_COUNT_ARC_STC, + [C(RESULT_MISS)] = PERF_COUNT_ARC_DCSM, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = PERF_COUNT_HW_INSTRUCTIONS, + [C(RESULT_MISS)] = PERF_COUNT_ARC_ICM, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = PERF_COUNT_ARC_LDC, + [C(RESULT_MISS)] = PERF_COUNT_ARC_EDTLB, + }, + /* DTLB LD/ST Miss not segregated by h/w*/ + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = PERF_COUNT_ARC_EITLB, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, + [C(RESULT_MISS)] = PERF_COUNT_HW_BRANCH_MISSES, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, +}; + enum arc_pmu_attr_groups { ARCPMU_ATTR_GR_EVENTS, ARCPMU_ATTR_GR_FORMATS, @@ -328,7 +490,7 @@ static void arc_pmu_stop(struct perf_event *event, int flags) } if (!(event->hw.state & PERF_HES_STOPPED)) { - /* stop ARC pmu here */ + /* stop hw counter here */ write_aux_reg(ARC_REG_PCT_INDEX, idx); /* condition code #0 is always "never" */ @@ -361,7 +523,7 @@ static int arc_pmu_add(struct perf_event *event, int flags) { struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; + int idx; idx = ffz(pmu_cpu->used_mask[0]); if (idx == arc_pmu->n_counters) diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c index 9e28058cdba8..200270a94558 100644 --- a/arch/arc/kernel/unwind.c +++ b/arch/arc/kernel/unwind.c @@ -245,14 +245,9 @@ static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) { struct eh_frame_hdr_table_entry *e1 = p1; struct eh_frame_hdr_table_entry *e2 = p2; - unsigned long v; - - v = e1->start; - e1->start = e2->start; - e2->start = v; - v = e1->fde; - e1->fde = e2->fde; - e2->fde = v; + + swap(e1->start, e2->start); + swap(e1->fde, e2->fde); } static void init_unwind_hdr(struct unwind_table *table, diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 517988e60cfc..2a7fbbb83b70 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -32,7 +32,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size) /* * Cache operations depending on function and direction argument, inspired by - * https://lkml.org/lkml/2018/5/18/979 + * https://lore.kernel.org/lkml/20180518175004.GF17671@n2100.armlinux.org.uk * "dma_sync_*_for_cpu and direction=TO_DEVICE (was Re: [PATCH 02/20] * dma-mapping: provide a generic dma-noncoherent implementation)" * diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index 63ea5a606ecd..b821df7b0089 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -50,7 +50,7 @@ static void __init axs10x_enable_gpio_intc_wire(void) * Current implementation of "irq-dw-apb-ictl" driver doesn't work well * with stacked INTCs. In particular problem happens if its master INTC * not yet instantiated. See discussion here - - * https://lkml.org/lkml/2015/3/4/755 + * https://lore.kernel.org/lkml/54F6FE2C.7020309@synopsys.com * * So setup the first gpio block as a passive pass thru and hide it from * DT hardware topology - connect MB intc directly to cpu intc diff --git a/arch/arc/plat-hsdk/platform.c b/arch/arc/plat-hsdk/platform.c index b3ea1fa11f87..c4a875b22352 100644 --- a/arch/arc/plat-hsdk/platform.c +++ b/arch/arc/plat-hsdk/platform.c @@ -52,7 +52,7 @@ static void __init hsdk_enable_gpio_intc_wire(void) * Current implementation of "irq-dw-apb-ictl" driver doesn't work well * with stacked INTCs. In particular problem happens if its master INTC * not yet instantiated. See discussion here - - * https://lkml.org/lkml/2015/3/4/755 + * https://lore.kernel.org/lkml/54F6FE2C.7020309@synopsys.com * * So setup the first gpio block as a passive pass thru and hide it from * DT hardware topology - connect intc directly to cpu intc diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index cb9e48dcba88..976315dea958 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -66,8 +66,6 @@ config UNWINDER_FRAME_POINTER config UNWINDER_ARM bool "ARM EABI stack unwinder" depends on AEABI && !FUNCTION_GRAPH_TRACER - # https://github.com/ClangBuiltLinux/linux/issues/732 - depends on !LD_IS_LLD || LLD_VERSION >= 110000 select ARM_UNWIND help This option enables stack unwinding support in the kernel diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 74d2f1401acb..954eee8a785a 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -76,10 +76,10 @@ CPPFLAGS_vmlinux.lds += -DTEXT_OFFSET="$(TEXT_OFFSET)" CPPFLAGS_vmlinux.lds += -DMALLOC_SIZE="$(MALLOC_SIZE)" compress-$(CONFIG_KERNEL_GZIP) = gzip -compress-$(CONFIG_KERNEL_LZO) = lzo -compress-$(CONFIG_KERNEL_LZMA) = lzma -compress-$(CONFIG_KERNEL_XZ) = xzkern -compress-$(CONFIG_KERNEL_LZ4) = lz4 +compress-$(CONFIG_KERNEL_LZO) = lzo_with_size +compress-$(CONFIG_KERNEL_LZMA) = lzma_with_size +compress-$(CONFIG_KERNEL_XZ) = xzkern_with_size +compress-$(CONFIG_KERNEL_LZ4) = lz4_with_size libfdt_objs := fdt_rw.o fdt_ro.o fdt_wip.o fdt.o diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f6e333b59314..dc10d26cb432 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1136,6 +1136,10 @@ config NUMA select GENERIC_ARCH_NUMA select ACPI_NUMA if ACPI select OF_NUMA + select HAVE_SETUP_PER_CPU_AREA + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK + select USE_PERCPU_NUMA_NODE_ID help Enable NUMA (Non-Uniform Memory Access) support. @@ -1152,22 +1156,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - -config HAVE_SETUP_PER_CPU_AREA - def_bool y - depends on NUMA - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y - depends on NUMA - -config NEED_PER_CPU_PAGE_FIRST_CHUNK - def_bool y - depends on NUMA - source "kernel/Kconfig.hz" config ARCH_SPARSEMEM_ENABLE diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index d955ade5df7c..5d460f6b7675 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -249,7 +249,7 @@ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \ " mov %" #w "[tmp], %" #w "[old]\n" \ " cas" #mb #sfx "\t%" #w "[tmp], %" #w "[new], %[v]\n" \ " mov %" #w "[ret], %" #w "[tmp]" \ - : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr), \ + : [ret] "+r" (x0), [v] "+Q" (*(u##sz *)ptr), \ [tmp] "=&r" (tmp) \ : [old] "r" (x1), [new] "r" (x2) \ : cl); \ diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index f9bef42c1411..497acf134d99 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -243,7 +243,7 @@ static inline void __cmpwait_case_##sz(volatile void *ptr, \ " cbnz %" #w "[tmp], 1f\n" \ " wfe\n" \ "1:" \ - : [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr) \ + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \ : [val] "r" (val)); \ } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index a8834434af99..db63cc885771 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -172,7 +172,7 @@ int pfn_is_map_memory(unsigned long pfn) } EXPORT_SYMBOL(pfn_is_map_memory); -static phys_addr_t memory_limit = PHYS_ADDR_MAX; +static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX; /* * Limit the memory size that was specified via FDT. diff --git a/arch/h8300/boot/compressed/Makefile b/arch/h8300/boot/compressed/Makefile index 5942793f77a0..6ab2fa5ba105 100644 --- a/arch/h8300/boot/compressed/Makefile +++ b/arch/h8300/boot/compressed/Makefile @@ -30,9 +30,11 @@ $(obj)/vmlinux.bin: vmlinux FORCE suffix-$(CONFIG_KERNEL_GZIP) := gzip suffix-$(CONFIG_KERNEL_LZO) := lzo +compress-$(CONFIG_KERNEL_GZIP) := gzip +compress-$(CONFIG_KERNEL_LZO) := lzo_with_size $(obj)/vmlinux.bin.$(suffix-y): $(obj)/vmlinux.bin FORCE - $(call if_changed,$(suffix-y)) + $(call if_changed,$(compress-y)) LDFLAGS_piggy.o := -r --format binary --oformat elf32-h8300-linux -T OBJCOPYFLAGS := -O binary diff --git a/arch/h8300/boot/dts/Makefile b/arch/h8300/boot/dts/Makefile index 69fcd817892c..c36bbd1f2592 100644 --- a/arch/h8300/boot/dts/Makefile +++ b/arch/h8300/boot/dts/Makefile @@ -1,9 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ifneq '$(CONFIG_H8300_BUILTIN_DTB)' '""' -BUILTIN_DTB := $(patsubst "%",%,$(CONFIG_H8300_BUILTIN_DTB)).dtb.o -endif - -obj-y += $(BUILTIN_DTB) +obj-y += $(addsuffix .dtb.o, $(CONFIG_H8300_BUILTIN_DTB)) dtb-$(CONFIG_H8300H_SIM) := h8300h_sim.dtb dtb-$(CONFIG_H8S_SIM) := h8s_sim.dtb diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 1e33666fa679..703952819e10 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -32,6 +32,7 @@ config IA64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE if (!ITANIUM) select HAVE_FUNCTION_TRACER + select HAVE_SETUP_PER_CPU_AREA select TTY select HAVE_ARCH_TRACEHOOK select HAVE_VIRT_CPU_ACCOUNTING @@ -88,9 +89,6 @@ config GENERIC_CALIBRATE_DELAY bool default y -config HAVE_SETUP_PER_CPU_AREA - def_bool y - config DMI bool default y @@ -292,6 +290,7 @@ config NUMA bool "NUMA support" depends on !FLATMEM select SMP + select USE_PERCPU_NUMA_NODE_ID help Say Y to compile the kernel to support NUMA (Non-Uniform Memory Access). This option is for configuring high-end multiprocessor @@ -311,10 +310,6 @@ config HAVE_ARCH_NODEDATA_EXTENSION def_bool y depends on NUMA -config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - config HAVE_MEMORYLESS_NODES def_bool NUMA diff --git a/arch/microblaze/Makefile b/arch/microblaze/Makefile index e775a696aa6f..1826d9ce4459 100644 --- a/arch/microblaze/Makefile +++ b/arch/microblaze/Makefile @@ -5,10 +5,10 @@ UTS_SYSNAME = -DUTS_SYSNAME=\"Linux\" # What CPU version are we building for, and crack it open # as major.minor.rev -CPU_VER := $(shell echo $(CONFIG_XILINX_MICROBLAZE0_HW_VER)) -CPU_MAJOR := $(shell echo $(CPU_VER) | cut -d '.' -f 1) -CPU_MINOR := $(shell echo $(CPU_VER) | cut -d '.' -f 2) -CPU_REV := $(shell echo $(CPU_VER) | cut -d '.' -f 3) +CPU_VER := $(CONFIG_XILINX_MICROBLAZE0_HW_VER) +CPU_MAJOR := $(word 1, $(subst ., , $(CPU_VER))) +CPU_MINOR := $(word 2, $(subst ., , $(CPU_VER))) +CPU_REV := $(word 3, $(subst ., , $(CPU_VER))) export CPU_VER CPU_MAJOR CPU_MINOR CPU_REV diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 3dd8c4618293..edf6c1577449 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2674,6 +2674,8 @@ config NUMA bool "NUMA Support" depends on SYS_SUPPORTS_NUMA select SMP + select HAVE_SETUP_PER_CPU_AREA + select NEED_PER_CPU_EMBED_FIRST_CHUNK help Say Y to compile the kernel to support NUMA (Non-Uniform Memory Access). This option improves performance on systems with more @@ -2684,14 +2686,6 @@ config NUMA config SYS_SUPPORTS_NUMA bool -config HAVE_SETUP_PER_CPU_AREA - def_bool y - depends on NUMA - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y - depends on NUMA - config RELOCATABLE bool "Relocatable kernel" depends on SYS_SUPPORTS_RELOCATABLE diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 8b03ef13133a..5a15d51e8884 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -64,12 +64,12 @@ $(obj)/vmlinux.bin: $(KBUILD_IMAGE) FORCE $(call if_changed,objcopy) tool_$(CONFIG_KERNEL_GZIP) = gzip -tool_$(CONFIG_KERNEL_BZIP2) = bzip2 -tool_$(CONFIG_KERNEL_LZ4) = lz4 -tool_$(CONFIG_KERNEL_LZMA) = lzma -tool_$(CONFIG_KERNEL_LZO) = lzo -tool_$(CONFIG_KERNEL_XZ) = xzkern -tool_$(CONFIG_KERNEL_ZSTD) = zstd22 +tool_$(CONFIG_KERNEL_BZIP2) = bzip2_with_size +tool_$(CONFIG_KERNEL_LZ4) = lz4_with_size +tool_$(CONFIG_KERNEL_LZMA) = lzma_with_size +tool_$(CONFIG_KERNEL_LZO) = lzo_with_size +tool_$(CONFIG_KERNEL_XZ) = xzkern_with_size +tool_$(CONFIG_KERNEL_ZSTD) = zstd22_with_size targets += vmlinux.bin.z diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 325e1552cbea..5a8002839550 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -519,17 +519,9 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return node_distance(cpu_to_node(from), cpu_to_node(to)); } -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, - size_t align) +static int __init pcpu_cpu_to_node(int cpu) { - return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_ACCESSIBLE, - cpu_to_node(cpu)); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); + return cpu_to_node(cpu); } void __init setup_per_cpu_areas(void) @@ -545,7 +537,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, - pcpu_fc_alloc, pcpu_fc_free); + pcpu_cpu_to_node); if (rc < 0) panic("Failed to initialize percpu areas."); diff --git a/arch/nds32/Makefile b/arch/nds32/Makefile index 797ad9b450af..b33d5d81b6ae 100644 --- a/arch/nds32/Makefile +++ b/arch/nds32/Makefile @@ -31,12 +31,6 @@ core-y += arch/nds32/kernel/ arch/nds32/mm/ core-$(CONFIG_FPU) += arch/nds32/math-emu/ libs-y += arch/nds32/lib/ -ifneq '$(CONFIG_NDS32_BUILTIN_DTB)' '""' -BUILTIN_DTB := y -else -BUILTIN_DTB := n -endif - ifdef CONFIG_CPU_LITTLE_ENDIAN KBUILD_CFLAGS += $(call cc-option, -EL) KBUILD_AFLAGS += $(call cc-option, -EL) diff --git a/arch/nds32/boot/dts/Makefile b/arch/nds32/boot/dts/Makefile index f84bd529b6fd..4fc69562eae8 100644 --- a/arch/nds32/boot/dts/Makefile +++ b/arch/nds32/boot/dts/Makefile @@ -1,7 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -ifneq '$(CONFIG_NDS32_BUILTIN_DTB)' '""' -BUILTIN_DTB := $(patsubst "%",%,$(CONFIG_NDS32_BUILTIN_DTB)).dtb.o -else -BUILTIN_DTB := -endif -obj-$(CONFIG_OF) += $(BUILTIN_DTB) +obj-$(CONFIG_OF) += $(addsuffix .dtb.o, $(CONFIG_NDS32_BUILTIN_DTB)) diff --git a/arch/nios2/boot/dts/Makefile b/arch/nios2/boot/dts/Makefile index a91a0b09be63..e9e31bb40df8 100644 --- a/arch/nios2/boot/dts/Makefile +++ b/arch/nios2/boot/dts/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := $(patsubst "%.dts",%.dtb.o,$(CONFIG_NIOS2_DTB_SOURCE)) +obj-y := $(patsubst %.dts,%.dtb.o,$(CONFIG_NIOS2_DTB_SOURCE)) dtstree := $(srctree)/$(src) dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts)) diff --git a/arch/openrisc/boot/dts/Makefile b/arch/openrisc/boot/dts/Makefile index 17dd791a833f..13db5a2aab52 100644 --- a/arch/openrisc/boot/dts/Makefile +++ b/arch/openrisc/boot/dts/Makefile @@ -1,9 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -ifneq '$(CONFIG_OPENRISC_BUILTIN_DTB)' '""' -BUILTIN_DTB := $(patsubst "%",%,$(CONFIG_OPENRISC_BUILTIN_DTB)).dtb.o -else -BUILTIN_DTB := -endif -obj-y += $(BUILTIN_DTB) +obj-y += $(addsuffix .dtb.o, $(CONFIG_OPENRISC_BUILTIN_DTB)) #DTC_FLAGS ?= -p 1024 diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile index 116bd5c1873c..a294a1b58ee7 100644 --- a/arch/parisc/boot/compressed/Makefile +++ b/arch/parisc/boot/compressed/Makefile @@ -50,8 +50,6 @@ OBJCOPYFLAGS_vmlinux.bin := -R .comment -R .note -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -vmlinux.bin.all-y := $(obj)/vmlinux.bin - suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZ4) := lz4 @@ -59,18 +57,18 @@ suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_XZ) := xz -$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lz4) -$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzma) -$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzo) -$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE - $(call if_changed,xzkern) +$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE + $(call if_changed,bzip2_with_size) +$(obj)/vmlinux.bin.lz4: $(obj)/vmlinux.bin FORCE + $(call if_changed,lz4_with_size) +$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzma_with_size) +$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzo_with_size) +$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE + $(call if_changed,xzkern_with_size) LDFLAGS_piggy.o := -r --format binary --oformat $(LD_BFD) -T $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index b669f4b9040b..3a3d05438408 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -289,6 +289,7 @@ extern int _parisc_requires_coherency; extern int running_on_qemu; +extern void __noreturn toc_intr(struct pt_regs *regs); extern void toc_handler(void); extern unsigned int toc_handler_size; extern unsigned int toc_handler_csum; diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index cceb09855e03..b91cb45ffd4e 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -48,6 +48,7 @@ struct proc_dir_entry * proc_mckinley_root __read_mostly = NULL; void __init setup_cmdline(char **cmdline_p) { extern unsigned int boot_args[]; + char *p; /* Collect stuff passed in from the boot loader */ @@ -56,9 +57,19 @@ void __init setup_cmdline(char **cmdline_p) /* called from hpux boot loader */ boot_command_line[0] = '\0'; } else { - strlcpy(boot_command_line, (char *)__va(boot_args[1]), + strscpy(boot_command_line, (char *)__va(boot_args[1]), COMMAND_LINE_SIZE); + /* autodetect console type (if not done by palo yet) */ + p = boot_command_line; + if (!str_has_prefix(p, "console=") && !strstr(p, " console=")) { + strlcat(p, " console=", COMMAND_LINE_SIZE); + if (PAGE0->mem_cons.cl_class == CL_DUPLEX) + strlcat(p, "ttyS0", COMMAND_LINE_SIZE); + else + strlcat(p, "tty0", COMMAND_LINE_SIZE); + } + #ifdef CONFIG_BLK_DEV_INITRD if (boot_args[2] != 0) /* did palo pass us a ramdisk? */ { @@ -68,7 +79,7 @@ void __init setup_cmdline(char **cmdline_p) #endif } - strcpy(command_line, boot_command_line); + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; } diff --git a/arch/parisc/kernel/toc.c b/arch/parisc/kernel/toc.c index be9a0bebe61e..e4b48d07afbd 100644 --- a/arch/parisc/kernel/toc.c +++ b/arch/parisc/kernel/toc.c @@ -10,9 +10,10 @@ #include <asm/pdc.h> #include <asm/pdc_chassis.h> #include <asm/ldcw.h> +#include <asm/processor.h> static unsigned int __aligned(16) toc_lock = 1; -DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack); +DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack) __visible; static void toc20_to_pt_regs(struct pt_regs *regs, struct pdc_toc_pim_20 *toc) { diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 0631c9241af3..b779603978e1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -55,15 +55,6 @@ config ARCH_MMAP_RND_COMPAT_BITS_MIN default 9 if PPC_16K_PAGES # 9 = 23 (8MB) - 14 (16K) default 11 # 11 = 23 (8MB) - 12 (4K) -config HAVE_SETUP_PER_CPU_AREA - def_bool PPC64 - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y if PPC64 - -config NEED_PER_CPU_PAGE_FIRST_CHUNK - def_bool y if PPC64 - config NR_IRQS int "Number of virtual interrupt numbers" range 32 1048576 @@ -241,6 +232,7 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ + select HAVE_SETUP_PER_CPU_AREA if PPC64 select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13) @@ -255,6 +247,8 @@ config PPC select MMU_GATHER_RCU_TABLE_FREE select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE + select NEED_PER_CPU_EMBED_FIRST_CHUNK if PPC64 + select NEED_PER_CPU_PAGE_FIRST_CHUNK if PPC64 select NEED_SG_DMA_LENGTH select OF select OF_DMA_DEFAULT_COHERENT if !NOT_COHERENT_CACHE @@ -660,6 +654,7 @@ config NUMA bool "NUMA Memory Allocation and Scheduler Support" depends on PPC64 && SMP default y if PPC_PSERIES || PPC_POWERNV + select USE_PERCPU_NUMA_NODE_ID help Enable NUMA (Non-Uniform Memory Access) support. @@ -673,10 +668,6 @@ config NODES_SHIFT default "4" depends on NUMA -config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - config HAVE_MEMORYLESS_NODES def_bool y depends on NUMA diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 9993c6256ad2..4b4827c475c6 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -365,7 +365,7 @@ image-$(CONFIG_PPC_PMAC) += zImage.coff zImage.miboot endif # Allow extra targets to be added to the defconfig -image-y += $(subst ",,$(CONFIG_EXTRA_TARGETS)) +image-y += $(CONFIG_EXTRA_TARGETS) initrd- := $(patsubst zImage%, zImage.initrd%, $(image-)) initrd-y := $(patsubst zImage%, zImage.initrd%, \ diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi index c90702b04a53..48e5cd61599c 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi @@ -79,6 +79,7 @@ fman0: fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xfc000 0x1000>; + fsl,erratum-a009885; }; xmdio0: mdio@fd000 { @@ -86,6 +87,7 @@ fman0: fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xfd000 0x1000>; + fsl,erratum-a009885; }; }; diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index e9c945b123c6..e46143c32308 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -168,6 +168,11 @@ interrupts = <14>; }; + srnprot@d800060 { + compatible = "nintendo,hollywood-srnprot"; + reg = <0x0d800060 0x4>; + }; + GPIO: gpio@d8000c0 { #gpio-cells = <2>; compatible = "nintendo,hollywood-gpio"; diff --git a/arch/powerpc/configs/gamecube_defconfig b/arch/powerpc/configs/gamecube_defconfig index 24c0e0ea5aeb..91a1b99f4e8f 100644 --- a/arch/powerpc/configs/gamecube_defconfig +++ b/arch/powerpc/configs/gamecube_defconfig @@ -68,7 +68,7 @@ CONFIG_SND_SEQUENCER=y CONFIG_SND_SEQUENCER_OSS=y # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_GENERIC=y +CONFIG_RTC_DRV_GAMECUBE=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_ISO9660_FS=y diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index a0c45bf2bfb1..0ab78c51455d 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -98,7 +98,7 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_LEDS_TRIGGER_PANIC=y CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_GENERIC=y +CONFIG_RTC_DRV_GAMECUBE=y CONFIG_NVMEM_NINTENDO_OTP=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index d87f7c1103ce..be8577ac9397 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -771,50 +771,6 @@ void __init emergency_stack_init(void) } #ifdef CONFIG_SMP -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. - */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, - size_t align) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = early_cpu_to_node(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -static void __init pcpu_free_bootmem(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int pcpu_cpu_distance(unsigned int from, unsigned int to) { if (early_cpu_to_node(from) == early_cpu_to_node(to)) @@ -823,53 +779,13 @@ static int pcpu_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init pcpu_populate_pte(unsigned long addr) +static __init int pcpu_cpu_to_node(int cpu) { - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!new) - goto err_alloc; - p4d_populate(&init_mm, p4d, new); - } - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!new) - goto err_alloc; - pud_populate(&init_mm, pud, new); - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - pte_t *new; - - new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); - if (!new) - goto err_alloc; - pmd_populate_kernel(&init_mm, pmd, new); - } - - return; - -err_alloc: - panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + return early_cpu_to_node(cpu); } +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { @@ -900,7 +816,7 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_PAGE) { rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_alloc_bootmem, pcpu_free_bootmem); + pcpu_cpu_to_node); if (rc) pr_warn("PERCPU: %s allocator failed (%d), " "falling back to page size\n", @@ -908,8 +824,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) - rc = pcpu_page_first_chunk(0, pcpu_alloc_bootmem, pcpu_free_bootmem, - pcpu_populate_pte); + rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 09abf62ae0ad..5adcbd9b5e88 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -14,6 +14,7 @@ config RISCV def_bool y select ARCH_CLOCKSOURCE_INIT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION + select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_VIRTUAL if MMU @@ -75,6 +76,7 @@ config RISCV select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU + select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_VMAP_STACK if MMU && 64BIT select HAVE_ASM_MODVERSIONS @@ -145,27 +147,16 @@ config MMU Select if you want MMU-based virtualised addressing space support by paged memory management. If unsure, say 'Y'. -config VA_BITS - int - default 32 if 32BIT - default 39 if 64BIT - -config PA_BITS - int - default 34 if 32BIT - default 56 if 64BIT - config PAGE_OFFSET hex - default 0xC0000000 if 32BIT && MAXPHYSMEM_1GB + default 0xC0000000 if 32BIT default 0x80000000 if 64BIT && !MMU - default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB - default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB + default 0xffffaf8000000000 if 64BIT config KASAN_SHADOW_OFFSET hex depends on KASAN_GENERIC - default 0xdfffffc800000000 if 64BIT + default 0xdfffffff00000000 if 64BIT default 0xffffffff if 32BIT config ARCH_FLATMEM_ENABLE @@ -211,7 +202,7 @@ config FIX_EARLYCON_MEM config PGTABLE_LEVELS int - default 3 if 64BIT + default 4 if 64BIT default 2 config LOCKDEP_SUPPORT @@ -269,24 +260,6 @@ config MODULE_SECTIONS bool select HAVE_MOD_ARCH_SPECIFIC -choice - prompt "Maximum Physical Memory" - default MAXPHYSMEM_1GB if 32BIT - default MAXPHYSMEM_2GB if 64BIT && CMODEL_MEDLOW - default MAXPHYSMEM_128GB if 64BIT && CMODEL_MEDANY - - config MAXPHYSMEM_1GB - depends on 32BIT - bool "1GiB" - config MAXPHYSMEM_2GB - depends on 64BIT && CMODEL_MEDLOW - bool "2GiB" - config MAXPHYSMEM_128GB - depends on 64BIT && CMODEL_MEDANY - bool "128GiB" -endchoice - - config SMP bool "Symmetric Multi-Processing" help @@ -333,6 +306,8 @@ config NUMA select GENERIC_ARCH_NUMA select OF_NUMA select ARCH_SUPPORTS_NUMA_BALANCING + select USE_PERCPU_NUMA_NODE_ID + select NEED_PER_CPU_EMBED_FIRST_CHUNK help Enable NUMA (Non-Uniform Memory Access) support. @@ -348,14 +323,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y - depends on NUMA - config RISCV_ISA_C bool "Emit compressed instructions when building Linux" default y @@ -396,12 +363,25 @@ source "kernel/Kconfig.hz" config RISCV_SBI_V01 bool "SBI v0.1 support" - default y depends on RISCV_SBI help This config allows kernel to use SBI v0.1 APIs. This will be deprecated in future once legacy M-mode software are no longer in use. +config RISCV_BOOT_SPINWAIT + bool "Spinwait booting method" + depends on SMP + default y + help + This enables support for booting Linux via spinwait method. In the + spinwait method, all cores randomly jump to Linux. One of the cores + gets chosen via lottery and all other keep spinning on a percpu + variable. This method cannot support CPU hotplug and sparse hartid + scheme. It should be only enabled for M-mode Linux or platforms relying + on older firmware without SBI HSM extension. All other platforms should + rely on ordered booting via SBI HSM extension which gets chosen + dynamically at runtime if the firmware supports it. + config KEXEC bool "Kexec system call" select KEXEC_CORE diff --git a/arch/riscv/boot/dts/canaan/Makefile b/arch/riscv/boot/dts/canaan/Makefile index 9ee7156c0c31..c61b08ac8554 100644 --- a/arch/riscv/boot/dts/canaan/Makefile +++ b/arch/riscv/boot/dts/canaan/Makefile @@ -1,5 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -ifneq ($(CONFIG_SOC_CANAAN_K210_DTB_SOURCE),"") -dtb-y += $(strip $(shell echo $(CONFIG_SOC_CANAAN_K210_DTB_SOURCE))).dtb +dtb-$(CONFIG_SOC_CANAAN_K210_DTB_BUILTIN) += $(addsuffix .dtb, $(CONFIG_SOC_CANAAN_K210_DTB_SOURCE)) obj-$(CONFIG_SOC_CANAAN_K210_DTB_BUILTIN) += $(addsuffix .o, $(dtb-y)) -endif diff --git a/arch/riscv/boot/dts/canaan/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi index 5e8ca8142482..56f57118c633 100644 --- a/arch/riscv/boot/dts/canaan/k210.dtsi +++ b/arch/riscv/boot/dts/canaan/k210.dtsi @@ -103,8 +103,8 @@ clint0: timer@2000000 { compatible = "canaan,k210-clint", "sifive,clint0"; reg = <0x2000000 0xC000>; - interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7 - &cpu1_intc 3 &cpu1_intc 7>; + interrupts-extended = <&cpu0_intc 3>, <&cpu0_intc 7>, + <&cpu1_intc 3>, <&cpu1_intc 7>; }; plic0: interrupt-controller@c000000 { @@ -113,7 +113,7 @@ compatible = "canaan,k210-plic", "sifive,plic-1.0.0"; reg = <0xC000000 0x4000000>; interrupt-controller; - interrupts-extended = <&cpu0_intc 11 &cpu1_intc 11>; + interrupts-extended = <&cpu0_intc 11>, <&cpu1_intc 11>; riscv,ndev = <65>; }; @@ -130,10 +130,11 @@ compatible = "canaan,k210-gpiohs", "sifive,gpio0"; reg = <0x38001000 0x1000>; interrupt-controller; - interrupts = <34 35 36 37 38 39 40 41 - 42 43 44 45 46 47 48 49 - 50 51 52 53 54 55 56 57 - 58 59 60 61 62 63 64 65>; + interrupts = <34>, <35>, <36>, <37>, <38>, <39>, <40>, + <41>, <42>, <43>, <44>, <45>, <46>, <47>, + <48>, <49>, <50>, <51>, <52>, <53>, <54>, + <55>, <56>, <57>, <58>, <59>, <60>, <61>, + <62>, <63>, <64>, <65>; gpio-controller; ngpios = <32>; }; @@ -141,7 +142,7 @@ dmac0: dma-controller@50000000 { compatible = "snps,axi-dma-1.01a"; reg = <0x50000000 0x1000>; - interrupts = <27 28 29 30 31 32>; + interrupts = <27>, <28>, <29>, <30>, <31>, <32>; #dma-cells = <1>; clocks = <&sysclk K210_CLK_DMA>, <&sysclk K210_CLK_DMA>; clock-names = "core-clk", "cfgr-clk"; @@ -316,7 +317,7 @@ timer0: timer@502d0000 { compatible = "snps,dw-apb-timer"; reg = <0x502D0000 0x100>; - interrupts = <14 15>; + interrupts = <14>, <15>; clocks = <&sysclk K210_CLK_TIMER0>, <&sysclk K210_CLK_APB0>; clock-names = "timer", "pclk"; @@ -326,7 +327,7 @@ timer1: timer@502e0000 { compatible = "snps,dw-apb-timer"; reg = <0x502E0000 0x100>; - interrupts = <16 17>; + interrupts = <16>, <17>; clocks = <&sysclk K210_CLK_TIMER1>, <&sysclk K210_CLK_APB0>; clock-names = "timer", "pclk"; @@ -336,7 +337,7 @@ timer2: timer@502f0000 { compatible = "snps,dw-apb-timer"; reg = <0x502F0000 0x100>; - interrupts = <18 19>; + interrupts = <18>, <19>; clocks = <&sysclk K210_CLK_TIMER2>, <&sysclk K210_CLK_APB0>; clock-names = "timer", "pclk"; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts index 0bcaf35045e7..984872f3d3a9 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts @@ -199,7 +199,7 @@ }; &spi3 { - spi-flash@0 { + flash@0 { compatible = "jedec,spi-nor"; reg = <0>; spi-max-frequency = <50000000>; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts index ac8a03f5867a..7ba99b4da304 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts @@ -201,7 +201,7 @@ }; &spi3 { - spi-flash@0 { + flash@0 { compatible = "jedec,spi-nor"; reg = <0>; spi-max-frequency = <50000000>; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts index 623998194bc1..be9b12c9b374 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts @@ -209,7 +209,7 @@ }; &spi3 { - spi-flash@0 { + flash@0 { compatible = "jedec,spi-nor"; reg = <0>; spi-max-frequency = <50000000>; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts b/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts index cf605ba0d67e..031c0c28f819 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts @@ -174,7 +174,7 @@ }; &spi3 { - spi-flash@0 { + flash@0 { compatible = "jedec,spi-nor"; reg = <0>; spi-max-frequency = <50000000>; diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts index fc1e5869df1b..0c748ae1b006 100644 --- a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts @@ -35,6 +35,10 @@ }; }; +&refclk { + clock-frequency = <600000000>; +}; + &serial0 { status = "okay"; }; diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi index c9f6d205d2ba..869aaf0d5c06 100644 --- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi @@ -9,9 +9,6 @@ model = "Microchip PolarFire SoC"; compatible = "microchip,mpfs"; - chosen { - }; - cpus { #address-cells = <1>; #size-cells = <0>; @@ -142,6 +139,11 @@ }; }; + refclk: msspllclk { + compatible = "fixed-clock"; + #clock-cells = <0>; + }; + soc { #address-cells = <2>; #size-cells = <2>; @@ -156,62 +158,48 @@ cache-size = <2097152>; cache-unified; interrupt-parent = <&plic>; - interrupts = <1 2 3>; + interrupts = <1>, <2>, <3>; reg = <0x0 0x2010000 0x0 0x1000>; }; clint@2000000 { compatible = "sifive,fu540-c000-clint", "sifive,clint0"; reg = <0x0 0x2000000 0x0 0xC000>; - interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7 - &cpu1_intc 3 &cpu1_intc 7 - &cpu2_intc 3 &cpu2_intc 7 - &cpu3_intc 3 &cpu3_intc 7 - &cpu4_intc 3 &cpu4_intc 7>; + interrupts-extended = <&cpu0_intc 3>, <&cpu0_intc 7>, + <&cpu1_intc 3>, <&cpu1_intc 7>, + <&cpu2_intc 3>, <&cpu2_intc 7>, + <&cpu3_intc 3>, <&cpu3_intc 7>, + <&cpu4_intc 3>, <&cpu4_intc 7>; }; plic: interrupt-controller@c000000 { - #interrupt-cells = <1>; compatible = "sifive,fu540-c000-plic", "sifive,plic-1.0.0"; reg = <0x0 0xc000000 0x0 0x4000000>; - riscv,ndev = <186>; + #address-cells = <0>; + #interrupt-cells = <1>; interrupt-controller; - interrupts-extended = <&cpu0_intc 11 - &cpu1_intc 11 &cpu1_intc 9 - &cpu2_intc 11 &cpu2_intc 9 - &cpu3_intc 11 &cpu3_intc 9 - &cpu4_intc 11 &cpu4_intc 9>; + interrupts-extended = <&cpu0_intc 11>, + <&cpu1_intc 11>, <&cpu1_intc 9>, + <&cpu2_intc 11>, <&cpu2_intc 9>, + <&cpu3_intc 11>, <&cpu3_intc 9>, + <&cpu4_intc 11>, <&cpu4_intc 9>; + riscv,ndev = <186>; }; dma@3000000 { compatible = "sifive,fu540-c000-pdma"; reg = <0x0 0x3000000 0x0 0x8000>; interrupt-parent = <&plic>; - interrupts = <23 24 25 26 27 28 29 30>; + interrupts = <23>, <24>, <25>, <26>, <27>, <28>, <29>, + <30>; #dma-cells = <1>; }; - refclk: refclk { - compatible = "fixed-clock"; - #clock-cells = <0>; - clock-frequency = <600000000>; - clock-output-names = "msspllclk"; - }; - clkcfg: clkcfg@20002000 { compatible = "microchip,mpfs-clkcfg"; reg = <0x0 0x20002000 0x0 0x1000>; - reg-names = "mss_sysreg"; clocks = <&refclk>; #clock-cells = <1>; - clock-output-names = "cpu", "axi", "ahb", "envm", /* 0-3 */ - "mac0", "mac1", "mmc", "timer", /* 4-7 */ - "mmuart0", "mmuart1", "mmuart2", "mmuart3", /* 8-11 */ - "mmuart4", "spi0", "spi1", "i2c0", /* 12-15 */ - "i2c1", "can0", "can1", "usb", /* 16-19 */ - "rsvd", "rtc", "qspi", "gpio0", /* 20-23 */ - "gpio1", "gpio2", "ddrc", "fic0", /* 24-27 */ - "fic1", "fic2", "fic3", "athena", "cfm"; /* 28-32 */ }; serial0: serial@20000000 { @@ -267,7 +255,7 @@ compatible = "microchip,mpfs-sd4hc", "cdns,sd4hc"; reg = <0x0 0x20008000 0x0 0x1000>; interrupt-parent = <&plic>; - interrupts = <88 89>; + interrupts = <88>, <89>; clocks = <&clkcfg 6>; max-frequency = <200000000>; status = "disabled"; @@ -277,7 +265,7 @@ compatible = "cdns,macb"; reg = <0x0 0x20110000 0x0 0x2000>; interrupt-parent = <&plic>; - interrupts = <64 65 66 67>; + interrupts = <64>, <65>, <66>, <67>; local-mac-address = [00 00 00 00 00 00]; clocks = <&clkcfg 4>, <&clkcfg 2>; clock-names = "pclk", "hclk"; @@ -290,7 +278,7 @@ compatible = "cdns,macb"; reg = <0x0 0x20112000 0x0 0x2000>; interrupt-parent = <&plic>; - interrupts = <70 71 72 73>; + interrupts = <70>, <71>, <72>, <73>; local-mac-address = [00 00 00 00 00 00]; clocks = <&clkcfg 5>, <&clkcfg 2>; status = "disabled"; diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index 0655b5c4201d..3eef52b1a59b 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -137,20 +137,21 @@ soc { #address-cells = <2>; #size-cells = <2>; - compatible = "sifive,fu540-c000", "sifive,fu540", "simple-bus"; + compatible = "simple-bus"; ranges; plic0: interrupt-controller@c000000 { - #interrupt-cells = <1>; compatible = "sifive,fu540-c000-plic", "sifive,plic-1.0.0"; reg = <0x0 0xc000000 0x0 0x4000000>; - riscv,ndev = <53>; + #address-cells = <0>; + #interrupt-cells = <1>; interrupt-controller; - interrupts-extended = < - &cpu0_intc 0xffffffff - &cpu1_intc 0xffffffff &cpu1_intc 9 - &cpu2_intc 0xffffffff &cpu2_intc 9 - &cpu3_intc 0xffffffff &cpu3_intc 9 - &cpu4_intc 0xffffffff &cpu4_intc 9>; + interrupts-extended = + <&cpu0_intc 0xffffffff>, + <&cpu1_intc 0xffffffff>, <&cpu1_intc 9>, + <&cpu2_intc 0xffffffff>, <&cpu2_intc 9>, + <&cpu3_intc 0xffffffff>, <&cpu3_intc 9>, + <&cpu4_intc 0xffffffff>, <&cpu4_intc 9>; + riscv,ndev = <53>; }; prci: clock-controller@10000000 { compatible = "sifive,fu540-c000-prci"; @@ -170,7 +171,8 @@ compatible = "sifive,fu540-c000-pdma"; reg = <0x0 0x3000000 0x0 0x8000>; interrupt-parent = <&plic0>; - interrupts = <23 24 25 26 27 28 29 30>; + interrupts = <23>, <24>, <25>, <26>, <27>, <28>, <29>, + <30>; #dma-cells = <1>; }; uart1: serial@10011000 { @@ -195,8 +197,8 @@ }; qspi0: spi@10040000 { compatible = "sifive,fu540-c000-spi", "sifive,spi0"; - reg = <0x0 0x10040000 0x0 0x1000 - 0x0 0x20000000 0x0 0x10000000>; + reg = <0x0 0x10040000 0x0 0x1000>, + <0x0 0x20000000 0x0 0x10000000>; interrupt-parent = <&plic0>; interrupts = <51>; clocks = <&prci PRCI_CLK_TLCLK>; @@ -206,8 +208,8 @@ }; qspi1: spi@10041000 { compatible = "sifive,fu540-c000-spi", "sifive,spi0"; - reg = <0x0 0x10041000 0x0 0x1000 - 0x0 0x30000000 0x0 0x10000000>; + reg = <0x0 0x10041000 0x0 0x1000>, + <0x0 0x30000000 0x0 0x10000000>; interrupt-parent = <&plic0>; interrupts = <52>; clocks = <&prci PRCI_CLK_TLCLK>; @@ -229,8 +231,8 @@ compatible = "sifive,fu540-c000-gem"; interrupt-parent = <&plic0>; interrupts = <53>; - reg = <0x0 0x10090000 0x0 0x2000 - 0x0 0x100a0000 0x0 0x1000>; + reg = <0x0 0x10090000 0x0 0x2000>, + <0x0 0x100a0000 0x0 0x1000>; local-mac-address = [00 00 00 00 00 00]; clock-names = "pclk", "hclk"; clocks = <&prci PRCI_CLK_GEMGXLPLL>, @@ -243,7 +245,7 @@ compatible = "sifive,fu540-c000-pwm", "sifive,pwm0"; reg = <0x0 0x10020000 0x0 0x1000>; interrupt-parent = <&plic0>; - interrupts = <42 43 44 45>; + interrupts = <42>, <43>, <44>, <45>; clocks = <&prci PRCI_CLK_TLCLK>; #pwm-cells = <3>; status = "disabled"; @@ -252,7 +254,7 @@ compatible = "sifive,fu540-c000-pwm", "sifive,pwm0"; reg = <0x0 0x10021000 0x0 0x1000>; interrupt-parent = <&plic0>; - interrupts = <46 47 48 49>; + interrupts = <46>, <47>, <48>, <49>; clocks = <&prci PRCI_CLK_TLCLK>; #pwm-cells = <3>; status = "disabled"; @@ -265,7 +267,7 @@ cache-size = <2097152>; cache-unified; interrupt-parent = <&plic0>; - interrupts = <1 2 3>; + interrupts = <1>, <2>, <3>; reg = <0x0 0x2010000 0x0 0x1000>; }; gpio: gpio@10060000 { diff --git a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi index abbb960f90a0..8464b0e3c887 100644 --- a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi @@ -147,12 +147,12 @@ reg = <0x0 0xc000000 0x0 0x4000000>; riscv,ndev = <69>; interrupt-controller; - interrupts-extended = < - &cpu0_intc 0xffffffff - &cpu1_intc 0xffffffff &cpu1_intc 9 - &cpu2_intc 0xffffffff &cpu2_intc 9 - &cpu3_intc 0xffffffff &cpu3_intc 9 - &cpu4_intc 0xffffffff &cpu4_intc 9>; + interrupts-extended = + <&cpu0_intc 0xffffffff>, + <&cpu1_intc 0xffffffff>, <&cpu1_intc 9>, + <&cpu2_intc 0xffffffff>, <&cpu2_intc 9>, + <&cpu3_intc 0xffffffff>, <&cpu3_intc 9>, + <&cpu4_intc 0xffffffff>, <&cpu4_intc 9>; }; prci: clock-controller@10000000 { compatible = "sifive,fu740-c000-prci"; @@ -273,7 +273,7 @@ cache-size = <2097152>; cache-unified; interrupt-parent = <&plic0>; - interrupts = <19 21 22 20>; + interrupts = <19>, <21>, <22>, <20>; reg = <0x0 0x2010000 0x0 0x1000>; }; gpio: gpio@10060000 { diff --git a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts index 6bfa1f24d3de..c4ed9efdff03 100644 --- a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts +++ b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts @@ -39,6 +39,11 @@ clock-frequency = <RTCCLK_FREQ>; clock-output-names = "rtcclk"; }; + + gpio-poweroff { + compatible = "gpio-poweroff"; + gpios = <&gpio 2 GPIO_ACTIVE_LOW>; + }; }; &uart0 { diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index ef473e2f503b..f120fcc43d0a 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -2,6 +2,7 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y @@ -13,10 +14,10 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_BPF_SYSCALL=y +# CONFIG_SYSFS_SYSCALL is not set +CONFIG_SOC_MICROCHIP_POLARFIRE=y CONFIG_SOC_SIFIVE=y CONFIG_SOC_VIRT=y -CONFIG_SOC_MICROCHIP_POLARFIRE=y CONFIG_SMP=y CONFIG_HOTPLUG_CPU=y CONFIG_VIRTUALIZATION=y @@ -70,14 +71,14 @@ CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_VIRTIO=y CONFIG_SPI=y CONFIG_SPI_SIFIVE=y +# CONFIG_PTP_1588_CLOCK is not set CONFIG_GPIOLIB=y CONFIG_GPIO_SIFIVE=y -# CONFIG_PTP_1588_CLOCK is not set -CONFIG_POWER_RESET=y CONFIG_DRM=m CONFIG_DRM_RADEON=m CONFIG_DRM_NOUVEAU=m CONFIG_DRM_VIRTIO_GPU=m +CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_USB=y CONFIG_USB_XHCI_HCD=y @@ -88,10 +89,10 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_HCD_PLATFORM=y CONFIG_USB_STORAGE=y CONFIG_USB_UAS=y +CONFIG_MMC=y CONFIG_MMC_SDHCI=y CONFIG_MMC_SDHCI_PLTFM=y CONFIG_MMC_SDHCI_CADENCE=y -CONFIG_MMC=y CONFIG_MMC_SPI=y CONFIG_RTC_CLASS=y CONFIG_VIRTIO_PCI=y @@ -142,5 +143,3 @@ CONFIG_RCU_EQS_DEBUG=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_EFI=y diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig index b16a2a12c82a..3f42ed87dde8 100644 --- a/arch/riscv/configs/nommu_k210_defconfig +++ b/arch/riscv/configs/nommu_k210_defconfig @@ -29,8 +29,6 @@ CONFIG_EMBEDDED=y CONFIG_SLOB=y # CONFIG_MMU is not set CONFIG_SOC_CANAAN=y -CONFIG_SOC_CANAAN_K210_DTB_SOURCE="k210_generic" -CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_CMDLINE="earlycon console=ttySIF0" @@ -75,7 +73,6 @@ CONFIG_LEDS_GPIO=y CONFIG_LEDS_USER=y # CONFIG_VIRTIO_MENU is not set # CONFIG_VHOST_MENU is not set -# CONFIG_SURFACE_PLATFORMS is not set # CONFIG_FILE_LOCKING is not set # CONFIG_DNOTIFY is not set # CONFIG_INOTIFY_USER is not set diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig b/arch/riscv/configs/nommu_k210_sdcard_defconfig index 61f887f65419..2a82a3b2992b 100644 --- a/arch/riscv/configs/nommu_k210_sdcard_defconfig +++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig @@ -21,8 +21,6 @@ CONFIG_EMBEDDED=y CONFIG_SLOB=y # CONFIG_MMU is not set CONFIG_SOC_CANAAN=y -CONFIG_SOC_CANAAN_K210_DTB_SOURCE="k210_generic" -CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_CMDLINE="earlycon console=ttySIF0 rootdelay=2 root=/dev/mmcblk0p1 ro" @@ -30,7 +28,6 @@ CONFIG_CMDLINE_FORCE=y # CONFIG_SECCOMP is not set # CONFIG_STACKPROTECTOR is not set # CONFIG_GCC_PLUGINS is not set -# CONFIG_BLK_DEV_BSG is not set # CONFIG_MQ_IOSCHED_DEADLINE is not set # CONFIG_MQ_IOSCHED_KYBER is not set CONFIG_BINFMT_FLAT=y @@ -72,7 +69,6 @@ CONFIG_LEDS_GPIO=y CONFIG_LEDS_USER=y # CONFIG_VIRTIO_MENU is not set # CONFIG_VHOST_MENU is not set -# CONFIG_SURFACE_PLATFORMS is not set CONFIG_EXT2_FS=y # CONFIG_FILE_LOCKING is not set # CONFIG_DNOTIFY is not set diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig index e046a0babde4..e1c9864b6237 100644 --- a/arch/riscv/configs/nommu_virt_defconfig +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -24,15 +24,12 @@ CONFIG_EXPERT=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set CONFIG_SLOB=y -# CONFIG_SLAB_MERGE_DEFAULT is not set # CONFIG_MMU is not set CONFIG_SOC_VIRT=y -CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_CMDLINE="root=/dev/vda rw earlycon=uart8250,mmio,0x10000000,115200n8 console=ttyS0" CONFIG_CMDLINE_FORCE=y CONFIG_JUMP_LABEL=y -# CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set # CONFIG_EFI_PARTITION is not set diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig index 6e9f12ff968a..8b56a7f1eb06 100644 --- a/arch/riscv/configs/rv32_defconfig +++ b/arch/riscv/configs/rv32_defconfig @@ -2,6 +2,7 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y @@ -13,7 +14,7 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_BPF_SYSCALL=y +# CONFIG_SYSFS_SYSCALL is not set CONFIG_SOC_SIFIVE=y CONFIG_SOC_VIRT=y CONFIG_ARCH_RV32I=y @@ -69,10 +70,10 @@ CONFIG_HW_RANDOM_VIRTIO=y CONFIG_SPI=y CONFIG_SPI_SIFIVE=y # CONFIG_PTP_1588_CLOCK is not set -CONFIG_POWER_RESET=y CONFIG_DRM=y CONFIG_DRM_RADEON=y CONFIG_DRM_VIRTIO_GPU=y +CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_USB=y CONFIG_USB_XHCI_HCD=y @@ -132,4 +133,3 @@ CONFIG_RCU_EQS_DEBUG=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y -# CONFIG_SYSFS_SYSCALL is not set diff --git a/arch/riscv/errata/alternative.c b/arch/riscv/errata/alternative.c index 3b15885db70b..e8b4a0fe488c 100644 --- a/arch/riscv/errata/alternative.c +++ b/arch/riscv/errata/alternative.c @@ -22,7 +22,8 @@ static struct cpu_manufacturer_info_t { } cpu_mfr_info; static void (*vendor_patch_func)(struct alt_entry *begin, struct alt_entry *end, - unsigned long archid, unsigned long impid); + unsigned long archid, + unsigned long impid) __initdata; static inline void __init riscv_fill_cpu_mfr_info(void) { diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 445ccc97305a..57b86fd9916c 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += early_ioremap.h -generic-y += extable.h generic-y += flat.h generic-y += kvm_para.h generic-y += user.h diff --git a/arch/riscv/include/asm/asm-extable.h b/arch/riscv/include/asm/asm-extable.h new file mode 100644 index 000000000000..14be0673f5b5 --- /dev/null +++ b/arch/riscv/include/asm/asm-extable.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_ASM_EXTABLE_H +#define __ASM_ASM_EXTABLE_H + +#define EX_TYPE_NONE 0 +#define EX_TYPE_FIXUP 1 +#define EX_TYPE_BPF 2 +#define EX_TYPE_UACCESS_ERR_ZERO 3 + +#ifdef __ASSEMBLY__ + +#define __ASM_EXTABLE_RAW(insn, fixup, type, data) \ + .pushsection __ex_table, "a"; \ + .balign 4; \ + .long ((insn) - .); \ + .long ((fixup) - .); \ + .short (type); \ + .short (data); \ + .popsection; + + .macro _asm_extable, insn, fixup + __ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_FIXUP, 0) + .endm + +#else /* __ASSEMBLY__ */ + +#include <linux/bits.h> +#include <linux/stringify.h> +#include <asm/gpr-num.h> + +#define __ASM_EXTABLE_RAW(insn, fixup, type, data) \ + ".pushsection __ex_table, \"a\"\n" \ + ".balign 4\n" \ + ".long ((" insn ") - .)\n" \ + ".long ((" fixup ") - .)\n" \ + ".short (" type ")\n" \ + ".short (" data ")\n" \ + ".popsection\n" + +#define _ASM_EXTABLE(insn, fixup) \ + __ASM_EXTABLE_RAW(#insn, #fixup, __stringify(EX_TYPE_FIXUP), "0") + +#define EX_DATA_REG_ERR_SHIFT 0 +#define EX_DATA_REG_ERR GENMASK(4, 0) +#define EX_DATA_REG_ZERO_SHIFT 5 +#define EX_DATA_REG_ZERO GENMASK(9, 5) + +#define EX_DATA_REG(reg, gpr) \ + "((.L__gpr_num_" #gpr ") << " __stringify(EX_DATA_REG_##reg##_SHIFT) ")" + +#define _ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, err, zero) \ + __DEFINE_ASM_GPR_NUMS \ + __ASM_EXTABLE_RAW(#insn, #fixup, \ + __stringify(EX_TYPE_UACCESS_ERR_ZERO), \ + "(" \ + EX_DATA_REG(ERR, err) " | " \ + EX_DATA_REG(ZERO, zero) \ + ")") + +#define _ASM_EXTABLE_UACCESS_ERR(insn, fixup, err) \ + _ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, err, zero) + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_ASM_EXTABLE_H */ diff --git a/arch/riscv/include/asm/cpu_ops.h b/arch/riscv/include/asm/cpu_ops.h index a8ec3c5c1bd2..134590f1b843 100644 --- a/arch/riscv/include/asm/cpu_ops.h +++ b/arch/riscv/include/asm/cpu_ops.h @@ -40,7 +40,5 @@ struct cpu_operations { extern const struct cpu_operations *cpu_ops[NR_CPUS]; void __init cpu_set_ops(int cpu); -void cpu_update_secondary_bootdata(unsigned int cpuid, - struct task_struct *tidle); #endif /* ifndef __ASM_CPU_OPS_H */ diff --git a/arch/riscv/include/asm/cpu_ops_sbi.h b/arch/riscv/include/asm/cpu_ops_sbi.h new file mode 100644 index 000000000000..56e4b76d09ff --- /dev/null +++ b/arch/riscv/include/asm/cpu_ops_sbi.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021 by Rivos Inc. + */ +#ifndef __ASM_CPU_OPS_SBI_H +#define __ASM_CPU_OPS_SBI_H + +#ifndef __ASSEMBLY__ +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/threads.h> + +/** + * struct sbi_hart_boot_data - Hart specific boot used during booting and + * cpu hotplug. + * @task_ptr: A pointer to the hart specific tp + * @stack_ptr: A pointer to the hart specific sp + */ +struct sbi_hart_boot_data { + void *task_ptr; + void *stack_ptr; +}; +#endif + +#endif /* ifndef __ASM_CPU_OPS_SBI_H */ diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 5046f431645c..ae711692eec9 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -40,14 +40,13 @@ #ifndef CONFIG_64BIT #define SATP_PPN _AC(0x003FFFFF, UL) #define SATP_MODE_32 _AC(0x80000000, UL) -#define SATP_MODE SATP_MODE_32 #define SATP_ASID_BITS 9 #define SATP_ASID_SHIFT 22 #define SATP_ASID_MASK _AC(0x1FF, UL) #else #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) #define SATP_MODE_39 _AC(0x8000000000000000, UL) -#define SATP_MODE SATP_MODE_39 +#define SATP_MODE_48 _AC(0x9000000000000000, UL) #define SATP_ASID_BITS 16 #define SATP_ASID_SHIFT 44 #define SATP_ASID_MASK _AC(0xFFFF, UL) diff --git a/arch/riscv/include/asm/extable.h b/arch/riscv/include/asm/extable.h new file mode 100644 index 000000000000..512012d193dc --- /dev/null +++ b/arch/riscv/include/asm/extable.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_EXTABLE_H +#define _ASM_RISCV_EXTABLE_H + +/* + * The exception table consists of pairs of relative offsets: the first + * is the relative offset to an instruction that is allowed to fault, + * and the second is the relative offset at which the program should + * continue. No registers are modified, so it is entirely up to the + * continuation code to figure out what to do. + * + * All the routines below use bits of fixup code that are out of line + * with the main instruction path. This means when everything is well, + * we don't even have to jump over them. Further, they do not intrude + * on our cache or tlb entries. + */ + +struct exception_table_entry { + int insn, fixup; + short type, data; +}; + +#define ARCH_HAS_RELATIVE_EXTABLE + +#define swap_ex_entry_fixup(a, b, tmp, delta) \ +do { \ + (a)->fixup = (b)->fixup + (delta); \ + (b)->fixup = (tmp).fixup - (delta); \ + (a)->type = (b)->type; \ + (b)->type = (tmp).type; \ + (a)->data = (b)->data; \ + (b)->data = (tmp).data; \ +} while (0) + +bool fixup_exception(struct pt_regs *regs); + +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_ARCH_RV64I) +bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs); +#else +static inline bool +ex_handler_bpf(const struct exception_table_entry *ex, + struct pt_regs *regs) +{ + return false; +} +#endif + +#endif diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 54cbf07fb4e9..58a718573ad6 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -24,6 +24,7 @@ enum fixed_addresses { FIX_HOLE, FIX_PTE, FIX_PMD, + FIX_PUD, FIX_TEXT_POKE1, FIX_TEXT_POKE0, FIX_EARLYCON_MEM_BASE, diff --git a/arch/riscv/include/asm/futex.h b/arch/riscv/include/asm/futex.h index 1b00badb9f87..fc8130f995c1 100644 --- a/arch/riscv/include/asm/futex.h +++ b/arch/riscv/include/asm/futex.h @@ -11,6 +11,7 @@ #include <linux/uaccess.h> #include <linux/errno.h> #include <asm/asm.h> +#include <asm/asm-extable.h> /* We don't even really need the extable code, but for now keep it simple */ #ifndef CONFIG_MMU @@ -20,23 +21,14 @@ #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ { \ - uintptr_t tmp; \ __enable_user_access(); \ __asm__ __volatile__ ( \ "1: " insn " \n" \ "2: \n" \ - " .section .fixup,\"ax\" \n" \ - " .balign 4 \n" \ - "3: li %[r],%[e] \n" \ - " jump 2b,%[t] \n" \ - " .previous \n" \ - " .section __ex_table,\"a\" \n" \ - " .balign " RISCV_SZPTR " \n" \ - " " RISCV_PTR " 1b, 3b \n" \ - " .previous \n" \ + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %[r]) \ : [r] "+r" (ret), [ov] "=&r" (oldval), \ - [u] "+m" (*uaddr), [t] "=&r" (tmp) \ - : [op] "Jr" (oparg), [e] "i" (-EFAULT) \ + [u] "+m" (*uaddr) \ + : [op] "Jr" (oparg) \ : "memory"); \ __disable_user_access(); \ } @@ -98,18 +90,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, "2: sc.w.aqrl %[t],%z[nv],%[u] \n" " bnez %[t],1b \n" "3: \n" - " .section .fixup,\"ax\" \n" - " .balign 4 \n" - "4: li %[r],%[e] \n" - " jump 3b,%[t] \n" - " .previous \n" - " .section __ex_table,\"a\" \n" - " .balign " RISCV_SZPTR " \n" - " " RISCV_PTR " 1b, 4b \n" - " " RISCV_PTR " 2b, 4b \n" - " .previous \n" + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %[r]) \ + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %[r]) \ : [r] "+r" (ret), [v] "=&r" (val), [u] "+m" (*uaddr), [t] "=&r" (tmp) - : [ov] "Jr" (oldval), [nv] "Jr" (newval), [e] "i" (-EFAULT) + : [ov] "Jr" (oldval), [nv] "Jr" (newval) : "memory"); __disable_user_access(); diff --git a/arch/riscv/include/asm/gpr-num.h b/arch/riscv/include/asm/gpr-num.h new file mode 100644 index 000000000000..dfee2829fc7c --- /dev/null +++ b/arch/riscv/include/asm/gpr-num.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_GPR_NUM_H +#define __ASM_GPR_NUM_H + +#ifdef __ASSEMBLY__ + .equ .L__gpr_num_zero, 0 + .equ .L__gpr_num_ra, 1 + .equ .L__gpr_num_sp, 2 + .equ .L__gpr_num_gp, 3 + .equ .L__gpr_num_tp, 4 + .equ .L__gpr_num_t0, 5 + .equ .L__gpr_num_t1, 6 + .equ .L__gpr_num_t2, 7 + .equ .L__gpr_num_s0, 8 + .equ .L__gpr_num_s1, 9 + .equ .L__gpr_num_a0, 10 + .equ .L__gpr_num_a1, 11 + .equ .L__gpr_num_a2, 12 + .equ .L__gpr_num_a3, 13 + .equ .L__gpr_num_a4, 14 + .equ .L__gpr_num_a5, 15 + .equ .L__gpr_num_a6, 16 + .equ .L__gpr_num_a7, 17 + .equ .L__gpr_num_s2, 18 + .equ .L__gpr_num_s3, 19 + .equ .L__gpr_num_s4, 20 + .equ .L__gpr_num_s5, 21 + .equ .L__gpr_num_s6, 22 + .equ .L__gpr_num_s7, 23 + .equ .L__gpr_num_s8, 24 + .equ .L__gpr_num_s9, 25 + .equ .L__gpr_num_s10, 26 + .equ .L__gpr_num_s11, 27 + .equ .L__gpr_num_t3, 28 + .equ .L__gpr_num_t4, 29 + .equ .L__gpr_num_t5, 30 + .equ .L__gpr_num_t6, 31 + +#else /* __ASSEMBLY__ */ + +#define __DEFINE_ASM_GPR_NUMS \ +" .equ .L__gpr_num_zero, 0\n" \ +" .equ .L__gpr_num_ra, 1\n" \ +" .equ .L__gpr_num_sp, 2\n" \ +" .equ .L__gpr_num_gp, 3\n" \ +" .equ .L__gpr_num_tp, 4\n" \ +" .equ .L__gpr_num_t0, 5\n" \ +" .equ .L__gpr_num_t1, 6\n" \ +" .equ .L__gpr_num_t2, 7\n" \ +" .equ .L__gpr_num_s0, 8\n" \ +" .equ .L__gpr_num_s1, 9\n" \ +" .equ .L__gpr_num_a0, 10\n" \ +" .equ .L__gpr_num_a1, 11\n" \ +" .equ .L__gpr_num_a2, 12\n" \ +" .equ .L__gpr_num_a3, 13\n" \ +" .equ .L__gpr_num_a4, 14\n" \ +" .equ .L__gpr_num_a5, 15\n" \ +" .equ .L__gpr_num_a6, 16\n" \ +" .equ .L__gpr_num_a7, 17\n" \ +" .equ .L__gpr_num_s2, 18\n" \ +" .equ .L__gpr_num_s3, 19\n" \ +" .equ .L__gpr_num_s4, 20\n" \ +" .equ .L__gpr_num_s5, 21\n" \ +" .equ .L__gpr_num_s6, 22\n" \ +" .equ .L__gpr_num_s7, 23\n" \ +" .equ .L__gpr_num_s8, 24\n" \ +" .equ .L__gpr_num_s9, 25\n" \ +" .equ .L__gpr_num_s10, 26\n" \ +" .equ .L__gpr_num_s11, 27\n" \ +" .equ .L__gpr_num_t3, 28\n" \ +" .equ .L__gpr_num_t4, 29\n" \ +" .equ .L__gpr_num_t5, 30\n" \ +" .equ .L__gpr_num_t6, 31\n" + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_GPR_NUM_H */ diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index b00f503ec124..0b85e363e778 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -27,13 +27,18 @@ */ #define KASAN_SHADOW_SCALE_SHIFT 3 -#define KASAN_SHADOW_SIZE (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) -#define KASAN_SHADOW_START KERN_VIRT_START -#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) +#define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) +/* + * Depending on the size of the virtual address space, the region may not be + * aligned on PGDIR_SIZE, so force its alignment to ease its population. + */ +#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) +#define KASAN_SHADOW_END MODULES_LOWEST_VADDR #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) void kasan_init(void); asmlinkage void kasan_early_init(void); +void kasan_swapper_init(void); #endif #endif diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index b3e5ff0125fe..160e3a1e8f8b 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -31,9 +31,20 @@ * When not using MMU this corresponds to the first free page in * physical memory (aligned on a page boundary). */ +#ifdef CONFIG_64BIT +#ifdef CONFIG_MMU +#define PAGE_OFFSET kernel_map.page_offset +#else #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) - -#define KERN_VIRT_SIZE (-PAGE_OFFSET) +#endif +/* + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so + * define the PAGE_OFFSET value for SV39. + */ +#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) +#else +#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#endif /* CONFIG_64BIT */ #ifndef __ASSEMBLY__ @@ -86,6 +97,7 @@ extern unsigned long riscv_pfn_base; #endif /* CONFIG_MMU */ struct kernel_mapping { + unsigned long page_offset; unsigned long virt_addr; uintptr_t phys_addr; uintptr_t size; diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 0af6933a7100..11823004b87a 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -11,6 +11,8 @@ #include <asm/tlb.h> #ifdef CONFIG_MMU +#define __HAVE_ARCH_PUD_ALLOC_ONE +#define __HAVE_ARCH_PUD_FREE #include <asm-generic/pgalloc.h> static inline void pmd_populate_kernel(struct mm_struct *mm, @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); } + +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) +{ + if (pgtable_l4_enabled) { + unsigned long pfn = virt_to_pfn(pud); + + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); + } +} + +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, + pud_t *pud) +{ + if (pgtable_l4_enabled) { + unsigned long pfn = virt_to_pfn(pud); + + set_p4d_safe(p4d, + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); + } +} + +#define pud_alloc_one pud_alloc_one +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + if (pgtable_l4_enabled) + return __pud_alloc_one(mm, addr); + + return NULL; +} + +#define pud_free pud_free +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + if (pgtable_l4_enabled) + __pud_free(mm, pud); +} + +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) #endif /* __PAGETABLE_PMD_FOLDED */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 228261aa9628..bbbdd66e5e2f 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -8,16 +8,36 @@ #include <linux/const.h> -#define PGDIR_SHIFT 30 +extern bool pgtable_l4_enabled; + +#define PGDIR_SHIFT_L3 30 +#define PGDIR_SHIFT_L4 39 +#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3) + +#define PGDIR_SHIFT (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) /* Size of region mapped by a page global directory */ #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) +/* pud is folded into pgd in case of 3-level page table */ +#define PUD_SHIFT 30 +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) + #define PMD_SHIFT 21 /* Size of region mapped by a page middle directory */ #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE - 1)) +/* Page Upper Directory entry */ +typedef struct { + unsigned long pud; +} pud_t; + +#define pud_val(x) ((x).pud) +#define __pud(x) ((pud_t) { (x) }) +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t)) + /* Page Middle Directory entry */ typedef struct { unsigned long pmd; @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) +{ + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); +} + +static inline unsigned long _pud_pfn(pud_t pud) +{ + return pud_val(pud) >> _PAGE_PFN_SHIFT; +} + static inline pmd_t *pud_pgtable(pud_t pud) { return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); } +#define mm_pud_folded mm_pud_folded +static inline bool mm_pud_folded(struct mm_struct *mm) +{ + if (pgtable_l4_enabled) + return false; + + return true; +} + +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) + static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) { return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pud_ERROR(e) \ + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + if (pgtable_l4_enabled) + *p4dp = p4d; + else + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); +} + +static inline int p4d_none(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (p4d_val(p4d) == 0); + + return 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (p4d_val(p4d) & _PAGE_PRESENT); + + return 1; +} + +static inline int p4d_bad(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return !p4d_present(p4d); + + return 0; +} + +static inline void p4d_clear(p4d_t *p4d) +{ + if (pgtable_l4_enabled) + set_p4d(p4d, __p4d(0)); +} + +static inline pud_t *p4d_pgtable(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); + + return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); +} + +static inline struct page *p4d_page(p4d_t p4d) +{ + return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); +} + +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) + +#define pud_offset pud_offset +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + if (pgtable_l4_enabled) + return p4d_pgtable(*p4d) + pud_index(address); + + return (pud_t *)p4d; +} + #endif /* _ASM_RISCV_PGTABLE_64_H */ diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 2ee413912926..a6b0c89824c2 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -31,7 +31,7 @@ * _PAGE_PROT_NONE is set on not-present pages (and ignored by the hardware) to * distinguish them from swapped out pages */ -#define _PAGE_PROT_NONE _PAGE_READ +#define _PAGE_PROT_NONE _PAGE_GLOBAL #define _PAGE_PFN_SHIFT 10 diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index bf204e7c1f74..7e949f25c933 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -24,8 +24,19 @@ #define KERNEL_LINK_ADDR PAGE_OFFSET #endif +/* Number of entries in the page global directory */ +#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) +/* Number of entries in the page table */ +#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) + +/* + * Half of the kernel address space (half of the entries of the page global + * directory) is for the direct mapping. + */ +#define KERN_VIRT_SIZE ((PTRS_PER_PGD / 2 * PGDIR_SIZE) / 2) + #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) -#define VMALLOC_END (PAGE_OFFSET - 1) +#define VMALLOC_END PAGE_OFFSET #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE (SZ_128M) @@ -39,8 +50,10 @@ /* Modules always live before the kernel */ #ifdef CONFIG_64BIT -#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G) -#define MODULES_END (PFN_ALIGN((unsigned long)&_start)) +/* This is used to define the end of the KASAN shadow region */ +#define MODULES_LOWEST_VADDR (KERNEL_LINK_ADDR - SZ_2G) +#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G) +#define MODULES_END (PFN_ALIGN((unsigned long)&_start)) #endif /* @@ -48,10 +61,16 @@ * struct pages to map half the virtual address space. Then * position vmemmap directly below the VMALLOC region. */ +#ifdef CONFIG_64BIT +#define VA_BITS (pgtable_l4_enabled ? 48 : 39) +#else +#define VA_BITS 32 +#endif + #define VMEMMAP_SHIFT \ - (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) + (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) #define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT) -#define VMEMMAP_END (VMALLOC_START - 1) +#define VMEMMAP_END VMALLOC_START #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) /* @@ -83,8 +102,7 @@ #ifndef __ASSEMBLY__ -/* Page Upper Directory not used in RISC-V */ -#include <asm-generic/pgtable-nopud.h> +#include <asm-generic/pgtable-nop4d.h> #include <asm/page.h> #include <asm/tlbflush.h> #include <linux/mm_types.h> @@ -107,19 +125,27 @@ #define XIP_FIXUP(addr) (addr) #endif /* CONFIG_XIP_KERNEL */ -#ifdef CONFIG_MMU -/* Number of entries in the page global directory */ -#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) -/* Number of entries in the page table */ -#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) +struct pt_alloc_ops { + pte_t *(*get_pte_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pte)(uintptr_t va); +#ifndef __PAGETABLE_PMD_FOLDED + pmd_t *(*get_pmd_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pmd)(uintptr_t va); + pud_t *(*get_pud_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pud)(uintptr_t va); +#endif +}; + +extern struct pt_alloc_ops pt_ops __initdata; +#ifdef CONFIG_MMU /* Number of PGD entries that a user-mode program can use */ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) /* Page protection bits */ #define _PAGE_BASE (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_USER) -#define PAGE_NONE __pgprot(_PAGE_PROT_NONE) +#define PAGE_NONE __pgprot(_PAGE_PROT_NONE | _PAGE_READ) #define PAGE_READ __pgprot(_PAGE_BASE | _PAGE_READ) #define PAGE_WRITE __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_WRITE) #define PAGE_EXEC __pgprot(_PAGE_BASE | _PAGE_EXEC) @@ -628,11 +654,12 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * * Format of swap PTE: * bit 0: _PAGE_PRESENT (zero) - * bit 1: _PAGE_PROT_NONE (zero) - * bits 2 to 6: swap type - * bits 7 to XLEN-1: swap offset + * bit 1 to 3: _PAGE_LEAF (zero) + * bit 5: _PAGE_PROT_NONE (zero) + * bits 6 to 10: swap type + * bits 10 to XLEN-1: swap offset */ -#define __SWP_TYPE_SHIFT 2 +#define __SWP_TYPE_SHIFT 6 #define __SWP_TYPE_BITS 5 #define __SWP_TYPE_MASK ((1UL << __SWP_TYPE_BITS) - 1) #define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) @@ -648,12 +675,17 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) +#define __swp_entry_to_pmd(swp) __pmd((swp).val) +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ + /* * In the RV64 Linux scheme, we give the user half of the virtual-address space * and give the kernel the other (upper) half. */ #ifdef CONFIG_64BIT -#define KERN_VIRT_START (-(BIT(CONFIG_VA_BITS)) + TASK_SIZE) +#define KERN_VIRT_START (-(BIT(VA_BITS)) + TASK_SIZE) #else #define KERN_VIRT_START FIXADDR_START #endif @@ -661,11 +693,22 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, /* * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32. * Note that PGDIR_SIZE must evenly divide TASK_SIZE. + * Task size is: + * - 0x9fc00000 (~2.5GB) for RV32. + * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * + * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V + * Instruction Set Manual Volume II: Privileged Architecture" states that + * "load and store effective addresses, which are 64bits, must have bits + * 63–48 all equal to bit 47, or else a page-fault exception will occur." */ #ifdef CONFIG_64BIT -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) +#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) #else -#define TASK_SIZE FIXADDR_START +#define TASK_SIZE FIXADDR_START +#define TASK_SIZE_MIN TASK_SIZE #endif #else /* CONFIG_MMU */ @@ -691,6 +734,8 @@ extern uintptr_t _dtb_early_pa; #define dtb_early_va _dtb_early_va #define dtb_early_pa _dtb_early_pa #endif /* CONFIG_XIP_KERNEL */ +extern u64 satp_mode; +extern bool pgtable_l4_enabled; void paging_init(void); void misc_mem_init(void); diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 9c46dd3ff4a2..d1c37479d828 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -8,6 +8,7 @@ #define _ASM_RISCV_SBI_H #include <linux/types.h> +#include <linux/cpumask.h> #ifdef CONFIG_RISCV_SBI enum sbi_ext_id { @@ -27,6 +28,7 @@ enum sbi_ext_id { SBI_EXT_IPI = 0x735049, SBI_EXT_RFENCE = 0x52464E43, SBI_EXT_HSM = 0x48534D, + SBI_EXT_SRST = 0x53525354, /* Experimentals extensions must lie within this range */ SBI_EXT_EXPERIMENTAL_START = 0x08000000, @@ -78,6 +80,21 @@ enum sbi_hsm_hart_status { SBI_HSM_HART_STATUS_STOP_PENDING, }; +enum sbi_ext_srst_fid { + SBI_EXT_SRST_RESET = 0, +}; + +enum sbi_srst_reset_type { + SBI_SRST_RESET_TYPE_SHUTDOWN = 0, + SBI_SRST_RESET_TYPE_COLD_REBOOT, + SBI_SRST_RESET_TYPE_WARM_REBOOT, +}; + +enum sbi_srst_reset_reason { + SBI_SRST_RESET_REASON_NONE = 0, + SBI_SRST_RESET_REASON_SYS_FAILURE, +}; + #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 #define SBI_SPEC_VERSION_MAJOR_MASK 0x7f @@ -112,27 +129,27 @@ long sbi_get_mimpid(void); void sbi_set_timer(uint64_t stime_value); void sbi_shutdown(void); void sbi_clear_ipi(void); -int sbi_send_ipi(const unsigned long *hart_mask); -int sbi_remote_fence_i(const unsigned long *hart_mask); -int sbi_remote_sfence_vma(const unsigned long *hart_mask, +int sbi_send_ipi(const struct cpumask *cpu_mask); +int sbi_remote_fence_i(const struct cpumask *cpu_mask); +int sbi_remote_sfence_vma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size); -int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, +int sbi_remote_sfence_vma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid); -int sbi_remote_hfence_gvma(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size); -int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma_vmid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long vmid); -int sbi_remote_hfence_vvma(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size); -int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid); @@ -157,9 +174,17 @@ static inline unsigned long sbi_minor_version(void) return sbi_spec_version & SBI_SPEC_VERSION_MINOR_MASK; } +/* Make SBI version */ +static inline unsigned long sbi_mk_version(unsigned long major, + unsigned long minor) +{ + return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << + SBI_SPEC_VERSION_MAJOR_SHIFT) | minor; +} + int sbi_err_map_linux_errno(int err); #else /* CONFIG_RISCV_SBI */ -static inline int sbi_remote_fence_i(const unsigned long *hart_mask) { return -1; } +static inline int sbi_remote_fence_i(const struct cpumask *cpu_mask) { return -1; } static inline void sbi_init(void) {} #endif /* CONFIG_RISCV_SBI */ #endif /* _ASM_RISCV_SBI_H */ diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h index a7d2811f3536..23170c933d73 100644 --- a/arch/riscv/include/asm/smp.h +++ b/arch/riscv/include/asm/smp.h @@ -43,7 +43,6 @@ void arch_send_call_function_ipi_mask(struct cpumask *mask); void arch_send_call_function_single_ipi(int cpu); int riscv_hartid_to_cpuid(int hartid); -void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out); /* Set custom IPI operations */ void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops); @@ -63,8 +62,6 @@ asmlinkage void smp_callin(void); #if defined CONFIG_HOTPLUG_CPU int __cpu_disable(void); void __cpu_die(unsigned int cpu); -void cpu_stop(void); -#else #endif /* CONFIG_HOTPLUG_CPU */ #else @@ -85,13 +82,6 @@ static inline unsigned long cpuid_to_hartid_map(int cpu) return boot_cpu_hartid; } -static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in, - struct cpumask *out) -{ - cpumask_clear(out); - cpumask_set_cpu(boot_cpu_hartid, out); -} - static inline void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops) { } diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h index 45a7018a8118..63acaecc3374 100644 --- a/arch/riscv/include/asm/sparsemem.h +++ b/arch/riscv/include/asm/sparsemem.h @@ -4,7 +4,11 @@ #define _ASM_RISCV_SPARSEMEM_H #ifdef CONFIG_SPARSEMEM -#define MAX_PHYSMEM_BITS CONFIG_PA_BITS +#ifdef CONFIG_64BIT +#define MAX_PHYSMEM_BITS 56 +#else +#define MAX_PHYSMEM_BITS 34 +#endif /* CONFIG_64BIT */ #define SECTION_SIZE_BITS 27 #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index f314ff44c48d..c701a5e57a2b 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -8,6 +8,7 @@ #ifndef _ASM_RISCV_UACCESS_H #define _ASM_RISCV_UACCESS_H +#include <asm/asm-extable.h> #include <asm/pgtable.h> /* for TASK_SIZE */ /* @@ -80,25 +81,14 @@ static inline int __access_ok(unsigned long addr, unsigned long size) #define __get_user_asm(insn, x, ptr, err) \ do { \ - uintptr_t __tmp; \ __typeof__(x) __x; \ __asm__ __volatile__ ( \ "1:\n" \ - " " insn " %1, %3\n" \ + " " insn " %1, %2\n" \ "2:\n" \ - " .section .fixup,\"ax\"\n" \ - " .balign 4\n" \ - "3:\n" \ - " li %0, %4\n" \ - " li %1, 0\n" \ - " jump 2b, %2\n" \ - " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .balign " RISCV_SZPTR "\n" \ - " " RISCV_PTR " 1b, 3b\n" \ - " .previous" \ - : "+r" (err), "=&r" (__x), "=r" (__tmp) \ - : "m" (*(ptr)), "i" (-EFAULT)); \ + _ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 2b, %0, %1) \ + : "+r" (err), "=&r" (__x) \ + : "m" (*(ptr))); \ (x) = __x; \ } while (0) @@ -110,30 +100,18 @@ do { \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ u32 __lo, __hi; \ - uintptr_t __tmp; \ __asm__ __volatile__ ( \ "1:\n" \ - " lw %1, %4\n" \ + " lw %1, %3\n" \ "2:\n" \ - " lw %2, %5\n" \ + " lw %2, %4\n" \ "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .balign 4\n" \ - "4:\n" \ - " li %0, %6\n" \ - " li %1, 0\n" \ - " li %2, 0\n" \ - " jump 3b, %3\n" \ - " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .balign " RISCV_SZPTR "\n" \ - " " RISCV_PTR " 1b, 4b\n" \ - " " RISCV_PTR " 2b, 4b\n" \ - " .previous" \ - : "+r" (err), "=&r" (__lo), "=r" (__hi), \ - "=r" (__tmp) \ - : "m" (__ptr[__LSW]), "m" (__ptr[__MSW]), \ - "i" (-EFAULT)); \ + _ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 3b, %0, %1) \ + _ASM_EXTABLE_UACCESS_ERR_ZERO(2b, 3b, %0, %1) \ + : "+r" (err), "=&r" (__lo), "=r" (__hi) \ + : "m" (__ptr[__LSW]), "m" (__ptr[__MSW])); \ + if (err) \ + __hi = 0; \ (x) = (__typeof__(x))((__typeof__((x)-(x)))( \ (((u64)__hi << 32) | __lo))); \ } while (0) @@ -221,24 +199,14 @@ do { \ #define __put_user_asm(insn, x, ptr, err) \ do { \ - uintptr_t __tmp; \ __typeof__(*(ptr)) __x = x; \ __asm__ __volatile__ ( \ "1:\n" \ - " " insn " %z3, %2\n" \ + " " insn " %z2, %1\n" \ "2:\n" \ - " .section .fixup,\"ax\"\n" \ - " .balign 4\n" \ - "3:\n" \ - " li %0, %4\n" \ - " jump 2b, %1\n" \ - " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .balign " RISCV_SZPTR "\n" \ - " " RISCV_PTR " 1b, 3b\n" \ - " .previous" \ - : "+r" (err), "=r" (__tmp), "=m" (*(ptr)) \ - : "rJ" (__x), "i" (-EFAULT)); \ + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %0) \ + : "+r" (err), "=m" (*(ptr)) \ + : "rJ" (__x)); \ } while (0) #ifdef CONFIG_64BIT @@ -249,28 +217,18 @@ do { \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ u64 __x = (__typeof__((x)-(x)))(x); \ - uintptr_t __tmp; \ __asm__ __volatile__ ( \ "1:\n" \ - " sw %z4, %2\n" \ + " sw %z3, %1\n" \ "2:\n" \ - " sw %z5, %3\n" \ + " sw %z4, %2\n" \ "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .balign 4\n" \ - "4:\n" \ - " li %0, %6\n" \ - " jump 3b, %1\n" \ - " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .balign " RISCV_SZPTR "\n" \ - " " RISCV_PTR " 1b, 4b\n" \ - " " RISCV_PTR " 2b, 4b\n" \ - " .previous" \ - : "+r" (err), "=r" (__tmp), \ + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %0) \ + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %0) \ + : "+r" (err), \ "=m" (__ptr[__LSW]), \ "=m" (__ptr[__MSW]) \ - : "rJ" (__x), "rJ" (__x >> 32), "i" (-EFAULT)); \ + : "rJ" (__x), "rJ" (__x >> 32)); \ } while (0) #endif /* CONFIG_64BIT */ @@ -388,81 +346,6 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) __clear_user(to, n) : n; } -/* - * Atomic compare-and-exchange, but with a fixup for userspace faults. Faults - * will set "err" to -EFAULT, while successful accesses return the previous - * value. - */ -#define __cmpxchg_user(ptr, old, new, err, size, lrb, scb) \ -({ \ - __typeof__(ptr) __ptr = (ptr); \ - __typeof__(*(ptr)) __old = (old); \ - __typeof__(*(ptr)) __new = (new); \ - __typeof__(*(ptr)) __ret; \ - __typeof__(err) __err = 0; \ - register unsigned int __rc; \ - __enable_user_access(); \ - switch (size) { \ - case 4: \ - __asm__ __volatile__ ( \ - "0:\n" \ - " lr.w" #scb " %[ret], %[ptr]\n" \ - " bne %[ret], %z[old], 1f\n" \ - " sc.w" #lrb " %[rc], %z[new], %[ptr]\n" \ - " bnez %[rc], 0b\n" \ - "1:\n" \ - ".section .fixup,\"ax\"\n" \ - ".balign 4\n" \ - "2:\n" \ - " li %[err], %[efault]\n" \ - " jump 1b, %[rc]\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - ".balign " RISCV_SZPTR "\n" \ - " " RISCV_PTR " 1b, 2b\n" \ - ".previous\n" \ - : [ret] "=&r" (__ret), \ - [rc] "=&r" (__rc), \ - [ptr] "+A" (*__ptr), \ - [err] "=&r" (__err) \ - : [old] "rJ" (__old), \ - [new] "rJ" (__new), \ - [efault] "i" (-EFAULT)); \ - break; \ - case 8: \ - __asm__ __volatile__ ( \ - "0:\n" \ - " lr.d" #scb " %[ret], %[ptr]\n" \ - " bne %[ret], %z[old], 1f\n" \ - " sc.d" #lrb " %[rc], %z[new], %[ptr]\n" \ - " bnez %[rc], 0b\n" \ - "1:\n" \ - ".section .fixup,\"ax\"\n" \ - ".balign 4\n" \ - "2:\n" \ - " li %[err], %[efault]\n" \ - " jump 1b, %[rc]\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - ".balign " RISCV_SZPTR "\n" \ - " " RISCV_PTR " 1b, 2b\n" \ - ".previous\n" \ - : [ret] "=&r" (__ret), \ - [rc] "=&r" (__rc), \ - [ptr] "+A" (*__ptr), \ - [err] "=&r" (__err) \ - : [old] "rJ" (__old), \ - [new] "rJ" (__new), \ - [efault] "i" (-EFAULT)); \ - break; \ - default: \ - BUILD_BUG(); \ - } \ - __disable_user_access(); \ - (err) = __err; \ - __ret; \ -}) - #define HAVE_GET_KERNEL_NOFAULT #define __get_kernel_nofault(dst, src, type, err_label) \ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 3397ddac1a30..612556faa527 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -43,7 +43,8 @@ obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += cpu_ops.o -obj-$(CONFIG_SMP) += cpu_ops_spinwait.o + +obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 253126e4beef..df0519a64eaf 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -12,6 +12,7 @@ #include <asm/kvm_host.h> #include <asm/thread_info.h> #include <asm/ptrace.h> +#include <asm/cpu_ops_sbi.h> void asm_offsets(void); @@ -468,4 +469,6 @@ void asm_offsets(void) DEFINE(PT_SIZE_ON_STACK, ALIGN(sizeof(struct pt_regs), STACK_ALIGN)); OFFSET(KERNEL_MAP_VIRT_ADDR, kernel_mapping, virt_addr); + OFFSET(SBI_HART_BOOT_TASK_PTR_OFFSET, sbi_hart_boot_data, task_ptr); + OFFSET(SBI_HART_BOOT_STACK_PTR_OFFSET, sbi_hart_boot_data, stack_ptr); } diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c index df84e0c13db1..be7f05b542bb 100644 --- a/arch/riscv/kernel/cpu-hotplug.c +++ b/arch/riscv/kernel/cpu-hotplug.c @@ -14,12 +14,6 @@ #include <asm/cpu_ops.h> #include <asm/sbi.h> -void cpu_stop(void); -void arch_cpu_idle_dead(void) -{ - cpu_stop(); -} - bool cpu_has_hotplug(unsigned int cpu) { if (cpu_ops[cpu]->cpu_stop) @@ -75,7 +69,7 @@ void __cpu_die(unsigned int cpu) /* * Called from the idle thread for the CPU which has been shutdown. */ -void cpu_stop(void) +void arch_cpu_idle_dead(void) { idle_task_exit(); diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index f13b2c9ea912..ad0a7e9f828b 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -7,6 +7,7 @@ #include <linux/seq_file.h> #include <linux/of.h> #include <asm/smp.h> +#include <asm/pgtable.h> /* * Returns the hart ID of the given device tree node, or -ENODEV if the node @@ -71,18 +72,19 @@ static void print_isa(struct seq_file *f, const char *isa) seq_puts(f, "\n"); } -static void print_mmu(struct seq_file *f, const char *mmu_type) +static void print_mmu(struct seq_file *f) { + char sv_type[16]; + #if defined(CONFIG_32BIT) - if (strcmp(mmu_type, "riscv,sv32") != 0) - return; + strncpy(sv_type, "sv32", 5); #elif defined(CONFIG_64BIT) - if (strcmp(mmu_type, "riscv,sv39") != 0 && - strcmp(mmu_type, "riscv,sv48") != 0) - return; + if (pgtable_l4_enabled) + strncpy(sv_type, "sv48", 5); + else + strncpy(sv_type, "sv39", 5); #endif - - seq_printf(f, "mmu\t\t: %s\n", mmu_type+6); + seq_printf(f, "mmu\t\t: %s\n", sv_type); } static void *c_start(struct seq_file *m, loff_t *pos) @@ -107,14 +109,13 @@ static int c_show(struct seq_file *m, void *v) { unsigned long cpu_id = (unsigned long)v - 1; struct device_node *node = of_get_cpu_node(cpu_id, NULL); - const char *compat, *isa, *mmu; + const char *compat, *isa; seq_printf(m, "processor\t: %lu\n", cpu_id); seq_printf(m, "hart\t\t: %lu\n", cpuid_to_hartid_map(cpu_id)); if (!of_property_read_string(node, "riscv,isa", &isa)) print_isa(m, isa); - if (!of_property_read_string(node, "mmu-type", &mmu)) - print_mmu(m, mmu); + print_mmu(m); if (!of_property_read_string(node, "compatible", &compat) && strcmp(compat, "riscv")) seq_printf(m, "uarch\t\t: %s\n", compat); diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index 1985884fe829..170d07e57721 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -8,37 +8,29 @@ #include <linux/of.h> #include <linux/string.h> #include <linux/sched.h> -#include <linux/sched/task_stack.h> #include <asm/cpu_ops.h> #include <asm/sbi.h> #include <asm/smp.h> const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; -void *__cpu_up_stack_pointer[NR_CPUS] __section(".data"); -void *__cpu_up_task_pointer[NR_CPUS] __section(".data"); - extern const struct cpu_operations cpu_ops_sbi; +#ifdef CONFIG_RISCV_BOOT_SPINWAIT extern const struct cpu_operations cpu_ops_spinwait; - -void cpu_update_secondary_bootdata(unsigned int cpuid, - struct task_struct *tidle) -{ - int hartid = cpuid_to_hartid_map(cpuid); - - /* Make sure tidle is updated */ - smp_mb(); - WRITE_ONCE(__cpu_up_stack_pointer[hartid], - task_stack_page(tidle) + THREAD_SIZE); - WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle); -} +#else +const struct cpu_operations cpu_ops_spinwait = { + .name = "", + .cpu_prepare = NULL, + .cpu_start = NULL, +}; +#endif void __init cpu_set_ops(int cpuid) { #if IS_ENABLED(CONFIG_RISCV_SBI) if (sbi_probe_extension(SBI_EXT_HSM) > 0) { if (!cpuid) - pr_info("SBI v0.2 HSM extension detected\n"); + pr_info("SBI HSM extension detected\n"); cpu_ops[cpuid] = &cpu_ops_sbi; } else #endif diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c index 685fae72b7f5..dae29cbfe550 100644 --- a/arch/riscv/kernel/cpu_ops_sbi.c +++ b/arch/riscv/kernel/cpu_ops_sbi.c @@ -7,13 +7,22 @@ #include <linux/init.h> #include <linux/mm.h> +#include <linux/sched/task_stack.h> #include <asm/cpu_ops.h> +#include <asm/cpu_ops_sbi.h> #include <asm/sbi.h> #include <asm/smp.h> extern char secondary_start_sbi[]; const struct cpu_operations cpu_ops_sbi; +/* + * Ordered booting via HSM brings one cpu at a time. However, cpu hotplug can + * be invoked from multiple threads in parallel. Define a per cpu data + * to handle that. + */ +DEFINE_PER_CPU(struct sbi_hart_boot_data, boot_data); + static int sbi_hsm_hart_start(unsigned long hartid, unsigned long saddr, unsigned long priv) { @@ -55,14 +64,19 @@ static int sbi_hsm_hart_get_status(unsigned long hartid) static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle) { - int rc; unsigned long boot_addr = __pa_symbol(secondary_start_sbi); int hartid = cpuid_to_hartid_map(cpuid); - - cpu_update_secondary_bootdata(cpuid, tidle); - rc = sbi_hsm_hart_start(hartid, boot_addr, 0); - - return rc; + unsigned long hsm_data; + struct sbi_hart_boot_data *bdata = &per_cpu(boot_data, cpuid); + + /* Make sure tidle is updated */ + smp_mb(); + bdata->task_ptr = tidle; + bdata->stack_ptr = task_stack_page(tidle) + THREAD_SIZE; + /* Make sure boot data is updated */ + smp_mb(); + hsm_data = __pa(bdata); + return sbi_hsm_hart_start(hartid, boot_addr, hsm_data); } static int sbi_cpu_prepare(unsigned int cpuid) diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c index b2c957bb68c1..346847f6c41c 100644 --- a/arch/riscv/kernel/cpu_ops_spinwait.c +++ b/arch/riscv/kernel/cpu_ops_spinwait.c @@ -6,11 +6,36 @@ #include <linux/errno.h> #include <linux/of.h> #include <linux/string.h> +#include <linux/sched/task_stack.h> #include <asm/cpu_ops.h> #include <asm/sbi.h> #include <asm/smp.h> const struct cpu_operations cpu_ops_spinwait; +void *__cpu_spinwait_stack_pointer[NR_CPUS] __section(".data"); +void *__cpu_spinwait_task_pointer[NR_CPUS] __section(".data"); + +static void cpu_update_secondary_bootdata(unsigned int cpuid, + struct task_struct *tidle) +{ + int hartid = cpuid_to_hartid_map(cpuid); + + /* + * The hartid must be less than NR_CPUS to avoid out-of-bound access + * errors for __cpu_spinwait_stack/task_pointer. That is not always possible + * for platforms with discontiguous hartid numbering scheme. That's why + * spinwait booting is not the recommended approach for any platforms + * booting Linux in S-mode and can be disabled in the future. + */ + if (hartid == INVALID_HARTID || hartid >= NR_CPUS) + return; + + /* Make sure tidle is updated */ + smp_mb(); + WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], + task_stack_page(tidle) + THREAD_SIZE); + WRITE_ONCE(__cpu_spinwait_task_pointer[hartid], tidle); +} static int spinwait_cpu_prepare(unsigned int cpuid) { @@ -28,7 +53,7 @@ static int spinwait_cpu_start(unsigned int cpuid, struct task_struct *tidle) * selects the first cpu to boot the kernel and causes the remainder * of the cpus to spin in a loop waiting for their stack pointer to be * setup by that main cpu. Writing to bootdata - * (i.e __cpu_up_stack_pointer) signals to the spinning cpus that they + * (i.e __cpu_spinwait_stack_pointer) signals to the spinning cpus that they * can continue the boot process. */ cpu_update_secondary_bootdata(cpuid, tidle); diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index f52f01ecbeea..2363b43312fc 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -11,6 +11,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/csr.h> +#include <asm/cpu_ops_sbi.h> #include <asm/hwcap.h> #include <asm/image.h> #include "efi-header.S" @@ -105,7 +106,8 @@ relocate: /* Compute satp for kernel page tables, but don't load it yet */ srl a2, a0, PAGE_SHIFT - li a1, SATP_MODE + la a1, satp_mode + REG_L a1, 0(a1) or a2, a2, a1 /* @@ -135,7 +137,7 @@ relocate: /* * Switch to kernel page tables. A full fence is necessary in order to * avoid using the trampoline translations, which are only correct for - * the first superpage. Fetching the fence is guarnteed to work + * the first superpage. Fetching the fence is guaranteed to work * because that first superpage is translated the same way. */ csrw CSR_SATP, a2 @@ -167,18 +169,17 @@ secondary_start_sbi: la a3, .Lsecondary_park csrw CSR_TVEC, a3 - slli a3, a0, LGREG - la a4, __cpu_up_stack_pointer - XIP_FIXUP_OFFSET a4 - la a5, __cpu_up_task_pointer - XIP_FIXUP_OFFSET a5 - add a4, a3, a4 - add a5, a3, a5 - REG_L sp, (a4) - REG_L tp, (a5) - - .global secondary_start_common -secondary_start_common: + /* a0 contains the hartid & a1 contains boot data */ + li a2, SBI_HART_BOOT_TASK_PTR_OFFSET + XIP_FIXUP_OFFSET a2 + add a2, a2, a1 + REG_L tp, (a2) + li a3, SBI_HART_BOOT_STACK_PTR_OFFSET + XIP_FIXUP_OFFSET a3 + add a3, a3, a1 + REG_L sp, (a3) + +.Lsecondary_start_common: #ifdef CONFIG_MMU /* Enable virtual memory and relocate to virtual address */ @@ -258,13 +259,13 @@ pmp_done: li t0, SR_FS csrc CSR_STATUS, t0 -#ifdef CONFIG_SMP +#ifdef CONFIG_RISCV_BOOT_SPINWAIT li t0, CONFIG_NR_CPUS blt a0, t0, .Lgood_cores tail .Lsecondary_park .Lgood_cores: -#endif + /* The lottery system is only required for spinwait booting method */ #ifndef CONFIG_XIP_KERNEL /* Pick one hart to run the main boot sequence */ la a3, hart_lottery @@ -283,6 +284,10 @@ pmp_done: /* first time here if hart_lottery in RAM is not set */ beq t0, t1, .Lsecondary_start +#endif /* CONFIG_XIP */ +#endif /* CONFIG_RISCV_BOOT_SPINWAIT */ + +#ifdef CONFIG_XIP_KERNEL la sp, _end + THREAD_SIZE XIP_FIXUP_OFFSET sp mv s0, a0 @@ -339,16 +344,16 @@ clear_bss_done: call soc_early_init tail start_kernel +#if CONFIG_RISCV_BOOT_SPINWAIT .Lsecondary_start: -#ifdef CONFIG_SMP /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park csrw CSR_TVEC, a3 slli a3, a0, LGREG - la a1, __cpu_up_stack_pointer + la a1, __cpu_spinwait_stack_pointer XIP_FIXUP_OFFSET a1 - la a2, __cpu_up_task_pointer + la a2, __cpu_spinwait_task_pointer XIP_FIXUP_OFFSET a2 add a1, a3, a1 add a2, a3, a2 @@ -365,8 +370,8 @@ clear_bss_done: beqz tp, .Lwait_for_cpu_up fence - tail secondary_start_common -#endif + tail .Lsecondary_start_common +#endif /* CONFIG_RISCV_BOOT_SPINWAIT */ END(_start_kernel) @@ -448,7 +453,3 @@ ENTRY(reset_regs) ret END(reset_regs) #endif /* CONFIG_RISCV_M_MODE */ - -__PAGE_ALIGNED_BSS - /* Empty zero page */ - .balign PAGE_SIZE diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index aabbc3ac3e48..726731ada534 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -16,7 +16,9 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa); asmlinkage void __init __copy_data(void); #endif -extern void *__cpu_up_stack_pointer[]; -extern void *__cpu_up_task_pointer[]; +#ifdef CONFIG_RISCV_BOOT_SPINWAIT +extern void *__cpu_spinwait_stack_pointer[]; +extern void *__cpu_spinwait_task_pointer[]; +#endif #endif /* __ASM_HEAD_H */ diff --git a/arch/riscv/kernel/kexec_relocate.S b/arch/riscv/kernel/kexec_relocate.S index a80b52a74f58..059c5e216ae7 100644 --- a/arch/riscv/kernel/kexec_relocate.S +++ b/arch/riscv/kernel/kexec_relocate.S @@ -159,25 +159,15 @@ SYM_CODE_START(riscv_kexec_norelocate) * s0: (const) Phys address to jump to * s1: (const) Phys address of the FDT image * s2: (const) The hartid of the current hart - * s3: (const) kernel_map.va_pa_offset, used when switching MMU off */ mv s0, a1 mv s1, a2 mv s2, a3 - mv s3, a4 /* Disable / cleanup interrupts */ csrw CSR_SIE, zero csrw CSR_SIP, zero - /* Switch to physical addressing */ - la s4, 1f - sub s4, s4, s3 - csrw CSR_STVEC, s4 - csrw CSR_SATP, zero - -.align 2 -1: /* Pass the arguments to the next kernel / Cleanup*/ mv a0, s2 mv a1, s1 @@ -214,7 +204,15 @@ SYM_CODE_START(riscv_kexec_norelocate) csrw CSR_SCAUSE, zero csrw CSR_SSCRATCH, zero - jalr zero, a2, 0 + /* + * Switch to physical addressing + * This will also trigger a jump to CSR_STVEC + * which in this case is the address of the new + * kernel. + */ + csrw CSR_STVEC, a2 + csrw CSR_SATP, zero + SYM_CODE_END(riscv_kexec_norelocate) .section ".rodata" diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c index e6eca271a4d6..cbef0fc73afa 100644 --- a/arch/riscv/kernel/machine_kexec.c +++ b/arch/riscv/kernel/machine_kexec.c @@ -169,7 +169,8 @@ machine_kexec(struct kimage *image) struct kimage_arch *internal = &image->arch; unsigned long jump_addr = (unsigned long) image->start; unsigned long first_ind_entry = (unsigned long) &image->head; - unsigned long this_hart_id = raw_smp_processor_id(); + unsigned long this_cpu_id = smp_processor_id(); + unsigned long this_hart_id = cpuid_to_hartid_map(this_cpu_id); unsigned long fdt_addr = internal->fdt_addr; void *control_code_buffer = page_address(image->control_code_page); riscv_kexec_method kexec_method = NULL; diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 9c0511119bad..a89243730153 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -42,12 +42,10 @@ static int riscv_gpr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; struct pt_regs *regs; regs = task_pt_regs(target); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, regs, 0, -1); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, regs, 0, -1); } #ifdef CONFIG_FPU diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 7402a417f38e..f72527fcb347 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/pm.h> +#include <linux/reboot.h> #include <asm/sbi.h> #include <asm/smp.h> @@ -15,8 +16,8 @@ unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT; EXPORT_SYMBOL(sbi_spec_version); static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init; -static int (*__sbi_send_ipi)(const unsigned long *hart_mask) __ro_after_init; -static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask, +static int (*__sbi_send_ipi)(const struct cpumask *cpu_mask) __ro_after_init; +static int (*__sbi_rfence)(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) __ro_after_init; @@ -66,6 +67,30 @@ int sbi_err_map_linux_errno(int err) EXPORT_SYMBOL(sbi_err_map_linux_errno); #ifdef CONFIG_RISCV_SBI_V01 +static unsigned long __sbi_v01_cpumask_to_hartmask(const struct cpumask *cpu_mask) +{ + unsigned long cpuid, hartid; + unsigned long hmask = 0; + + /* + * There is no maximum hartid concept in RISC-V and NR_CPUS must not be + * associated with hartid. As SBI v0.1 is only kept for backward compatibility + * and will be removed in the future, there is no point in supporting hartid + * greater than BITS_PER_LONG (32 for RV32 and 64 for RV64). Ideally, SBI v0.2 + * should be used for platforms with hartid greater than BITS_PER_LONG. + */ + for_each_cpu(cpuid, cpu_mask) { + hartid = cpuid_to_hartid_map(cpuid); + if (hartid >= BITS_PER_LONG) { + pr_warn("Unable to send any request to hartid > BITS_PER_LONG for SBI v0.1\n"); + break; + } + hmask |= 1 << hartid; + } + + return hmask; +} + /** * sbi_console_putchar() - Writes given character to the console device. * @ch: The data to be written to the console. @@ -131,33 +156,44 @@ static void __sbi_set_timer_v01(uint64_t stime_value) #endif } -static int __sbi_send_ipi_v01(const unsigned long *hart_mask) +static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask) { - sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)hart_mask, + unsigned long hart_mask; + + if (!cpu_mask) + cpu_mask = cpu_online_mask; + hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask); + + sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)(&hart_mask), 0, 0, 0, 0, 0); return 0; } -static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask, +static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { int result = 0; + unsigned long hart_mask; + + if (!cpu_mask) + cpu_mask = cpu_online_mask; + hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask); /* v0.2 function IDs are equivalent to v0.1 extension IDs */ switch (fid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: sbi_ecall(SBI_EXT_0_1_REMOTE_FENCE_I, 0, - (unsigned long)hart_mask, 0, 0, 0, 0, 0); + (unsigned long)&hart_mask, 0, 0, 0, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA, 0, - (unsigned long)hart_mask, start, size, + (unsigned long)&hart_mask, start, size, 0, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, 0, - (unsigned long)hart_mask, start, size, + (unsigned long)&hart_mask, start, size, arg4, 0, 0); break; default: @@ -179,7 +215,7 @@ static void __sbi_set_timer_v01(uint64_t stime_value) sbi_major_version(), sbi_minor_version()); } -static int __sbi_send_ipi_v01(const unsigned long *hart_mask) +static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask) { pr_warn("IPI extension is not available in SBI v%lu.%lu\n", sbi_major_version(), sbi_minor_version()); @@ -187,7 +223,7 @@ static int __sbi_send_ipi_v01(const unsigned long *hart_mask) return 0; } -static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask, +static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { @@ -211,37 +247,33 @@ static void __sbi_set_timer_v02(uint64_t stime_value) #endif } -static int __sbi_send_ipi_v02(const unsigned long *hart_mask) +static int __sbi_send_ipi_v02(const struct cpumask *cpu_mask) { - unsigned long hartid, hmask_val, hbase; - struct cpumask tmask; + unsigned long hartid, cpuid, hmask = 0, hbase = 0; struct sbiret ret = {0}; int result; - if (!hart_mask || !(*hart_mask)) { - riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask); - hart_mask = cpumask_bits(&tmask); - } + if (!cpu_mask) + cpu_mask = cpu_online_mask; - hmask_val = 0; - hbase = 0; - for_each_set_bit(hartid, hart_mask, NR_CPUS) { - if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) { + for_each_cpu(cpuid, cpu_mask) { + hartid = cpuid_to_hartid_map(cpuid); + if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) { ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, - hmask_val, hbase, 0, 0, 0, 0); + hmask, hbase, 0, 0, 0, 0); if (ret.error) goto ecall_failed; - hmask_val = 0; + hmask = 0; hbase = 0; } - if (!hmask_val) + if (!hmask) hbase = hartid; - hmask_val |= 1UL << (hartid - hbase); + hmask |= 1UL << (hartid - hbase); } - if (hmask_val) { + if (hmask) { ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, - hmask_val, hbase, 0, 0, 0, 0); + hmask, hbase, 0, 0, 0, 0); if (ret.error) goto ecall_failed; } @@ -251,11 +283,11 @@ static int __sbi_send_ipi_v02(const unsigned long *hart_mask) ecall_failed: result = sbi_err_map_linux_errno(ret.error); pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n", - __func__, hbase, hmask_val, result); + __func__, hbase, hmask, result); return result; } -static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, +static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask, unsigned long hbase, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) @@ -266,31 +298,31 @@ static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, switch (fid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: - ret = sbi_ecall(ext, fid, hmask_val, hbase, 0, 0, 0, 0); + ret = sbi_ecall(ext, fid, hmask, hbase, 0, 0, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, arg4, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, arg4, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, arg4, 0); break; default: @@ -302,43 +334,39 @@ static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, if (ret.error) { result = sbi_err_map_linux_errno(ret.error); pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n", - __func__, hbase, hmask_val, result); + __func__, hbase, hmask, result); } return result; } -static int __sbi_rfence_v02(int fid, const unsigned long *hart_mask, +static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { - unsigned long hmask_val, hartid, hbase; - struct cpumask tmask; + unsigned long hartid, cpuid, hmask = 0, hbase = 0; int result; - if (!hart_mask || !(*hart_mask)) { - riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask); - hart_mask = cpumask_bits(&tmask); - } + if (!cpu_mask) + cpu_mask = cpu_online_mask; - hmask_val = 0; - hbase = 0; - for_each_set_bit(hartid, hart_mask, NR_CPUS) { - if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) { - result = __sbi_rfence_v02_call(fid, hmask_val, hbase, + for_each_cpu(cpuid, cpu_mask) { + hartid = cpuid_to_hartid_map(cpuid); + if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) { + result = __sbi_rfence_v02_call(fid, hmask, hbase, start, size, arg4, arg5); if (result) return result; - hmask_val = 0; + hmask = 0; hbase = 0; } - if (!hmask_val) + if (!hmask) hbase = hartid; - hmask_val |= 1UL << (hartid - hbase); + hmask |= 1UL << (hartid - hbase); } - if (hmask_val) { - result = __sbi_rfence_v02_call(fid, hmask_val, hbase, + if (hmask) { + result = __sbi_rfence_v02_call(fid, hmask, hbase, start, size, arg4, arg5); if (result) return result; @@ -360,44 +388,44 @@ void sbi_set_timer(uint64_t stime_value) /** * sbi_send_ipi() - Send an IPI to any hart. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_send_ipi(const unsigned long *hart_mask) +int sbi_send_ipi(const struct cpumask *cpu_mask) { - return __sbi_send_ipi(hart_mask); + return __sbi_send_ipi(cpu_mask); } EXPORT_SYMBOL(sbi_send_ipi); /** * sbi_remote_fence_i() - Execute FENCE.I instruction on given remote harts. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_remote_fence_i(const unsigned long *hart_mask) +int sbi_remote_fence_i(const struct cpumask *cpu_mask) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_FENCE_I, - hart_mask, 0, 0, 0, 0); + cpu_mask, 0, 0, 0, 0); } EXPORT_SYMBOL(sbi_remote_fence_i); /** * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote * harts for the specified virtual address range. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the virtual address * @size: Total size of the virtual address range. * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_remote_sfence_vma(const unsigned long *hart_mask, +int sbi_remote_sfence_vma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, - hart_mask, start, size, 0, 0); + cpu_mask, start, size, 0, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma); @@ -405,38 +433,38 @@ EXPORT_SYMBOL(sbi_remote_sfence_vma); * sbi_remote_sfence_vma_asid() - Execute SFENCE.VMA instructions on given * remote harts for a virtual address range belonging to a specific ASID. * - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the virtual address * @size: Total size of the virtual address range. * @asid: The value of address space identifier (ASID). * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, +int sbi_remote_sfence_vma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, - hart_mask, start, size, asid, 0); + cpu_mask, start, size, asid, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma_asid); /** * sbi_remote_hfence_gvma() - Execute HFENCE.GVMA instructions on given remote * harts for the specified guest physical address range. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the guest physical address * @size: Total size of the guest physical address range. * * Return: None */ -int sbi_remote_hfence_gvma(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA, - hart_mask, start, size, 0, 0); + cpu_mask, start, size, 0, 0); } EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma); @@ -444,38 +472,38 @@ EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma); * sbi_remote_hfence_gvma_vmid() - Execute HFENCE.GVMA instructions on given * remote harts for a guest physical address range belonging to a specific VMID. * - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the guest physical address * @size: Total size of the guest physical address range. * @vmid: The value of guest ID (VMID). * * Return: 0 if success, Error otherwise. */ -int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma_vmid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long vmid) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID, - hart_mask, start, size, vmid, 0); + cpu_mask, start, size, vmid, 0); } EXPORT_SYMBOL(sbi_remote_hfence_gvma_vmid); /** * sbi_remote_hfence_vvma() - Execute HFENCE.VVMA instructions on given remote * harts for the current guest virtual address range. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the current guest virtual address * @size: Total size of the current guest virtual address range. * * Return: None */ -int sbi_remote_hfence_vvma(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA, - hart_mask, start, size, 0, 0); + cpu_mask, start, size, 0, 0); } EXPORT_SYMBOL(sbi_remote_hfence_vvma); @@ -484,23 +512,49 @@ EXPORT_SYMBOL(sbi_remote_hfence_vvma); * remote harts for current guest virtual address range belonging to a specific * ASID. * - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the current guest virtual address * @size: Total size of the current guest virtual address range. * @asid: The value of address space identifier (ASID). * * Return: None */ -int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID, - hart_mask, start, size, asid, 0); + cpu_mask, start, size, asid, 0); } EXPORT_SYMBOL(sbi_remote_hfence_vvma_asid); +static void sbi_srst_reset(unsigned long type, unsigned long reason) +{ + sbi_ecall(SBI_EXT_SRST, SBI_EXT_SRST_RESET, type, reason, + 0, 0, 0, 0); + pr_warn("%s: type=0x%lx reason=0x%lx failed\n", + __func__, type, reason); +} + +static int sbi_srst_reboot(struct notifier_block *this, + unsigned long mode, void *cmd) +{ + sbi_srst_reset((mode == REBOOT_WARM || mode == REBOOT_SOFT) ? + SBI_SRST_RESET_TYPE_WARM_REBOOT : + SBI_SRST_RESET_TYPE_COLD_REBOOT, + SBI_SRST_RESET_REASON_NONE); + return NOTIFY_DONE; +} + +static struct notifier_block sbi_srst_reboot_nb; + +static void sbi_srst_power_off(void) +{ + sbi_srst_reset(SBI_SRST_RESET_TYPE_SHUTDOWN, + SBI_SRST_RESET_REASON_NONE); +} + /** * sbi_probe_extension() - Check if an SBI extension ID is supported or not. * @extid: The extension ID to be probed. @@ -564,11 +618,7 @@ long sbi_get_mimpid(void) static void sbi_send_cpumask_ipi(const struct cpumask *target) { - struct cpumask hartid_mask; - - riscv_cpuid_to_hartid_mask(target, &hartid_mask); - - sbi_send_ipi(cpumask_bits(&hartid_mask)); + sbi_send_ipi(target); } static const struct riscv_ipi_ops sbi_ipi_ops = { @@ -608,6 +658,14 @@ void __init sbi_init(void) } else { __sbi_rfence = __sbi_rfence_v01; } + if ((sbi_spec_version >= sbi_mk_version(0, 3)) && + (sbi_probe_extension(SBI_EXT_SRST) > 0)) { + pr_info("SBI SRST extension detected\n"); + pm_power_off = sbi_srst_power_off; + sbi_srst_reboot_nb.notifier_call = sbi_srst_reboot; + sbi_srst_reboot_nb.priority = 192; + register_restart_handler(&sbi_srst_reboot_nb); + } } else { __sbi_set_timer = __sbi_set_timer_v01; __sbi_send_ipi = __sbi_send_ipi_v01; diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 2f6da845c9ae..b5d30ea92292 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -59,16 +59,6 @@ int riscv_hartid_to_cpuid(int hartid) return -ENOENT; } -void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out) -{ - int cpu; - - cpumask_clear(out); - for_each_cpu(cpu, in) - cpumask_set_cpu(cpuid_to_hartid_map(cpu), out); -} -EXPORT_SYMBOL_GPL(riscv_cpuid_to_hartid_mask); - bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { return phys_id == cpuid_to_hartid_map(cpu); diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index bd82375db51a..622f226454d5 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -96,7 +96,7 @@ void __init setup_smp(void) if (cpuid >= NR_CPUS) { pr_warn("Invalid cpuid [%d] for hartid [%d]\n", cpuid, hart); - break; + continue; } cpuid_to_hartid_map(cpuid) = hart; diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S index f5ed08262139..75e0fa8a700a 100644 --- a/arch/riscv/kernel/vmlinux-xip.lds.S +++ b/arch/riscv/kernel/vmlinux-xip.lds.S @@ -45,7 +45,6 @@ SECTIONS ENTRY_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT - *(.fixup) _etext = .; } RO_DATA(L1_CACHE_BYTES) diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 5104f3a871e3..4e6c88aa4d87 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -4,7 +4,7 @@ * Copyright (C) 2017 SiFive */ -#define RO_EXCEPTION_TABLE_ALIGN 16 +#define RO_EXCEPTION_TABLE_ALIGN 4 #ifdef CONFIG_XIP_KERNEL #include "vmlinux-xip.lds.S" @@ -48,7 +48,6 @@ SECTIONS ENTRY_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT - *(.fixup) _etext = .; } diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 9af67dbdc66a..f80a34fbf102 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -114,7 +114,6 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) { - struct cpumask hmask; unsigned long size = PAGE_SIZE; struct kvm_vmid *vmid = &kvm->arch.vmid; @@ -127,8 +126,7 @@ static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) * where the Guest/VM is running. */ preempt_disable(); - riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask); - sbi_remote_hfence_gvma_vmid(cpumask_bits(&hmask), addr, size, + sbi_remote_hfence_gvma_vmid(cpu_online_mask, addr, size, READ_ONCE(vmid->vmid)); preempt_enable(); } diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 00036b7f83b9..1bc0608a5bfd 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -82,7 +82,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run { int ret = 0; unsigned long i; - struct cpumask cm, hm; + struct cpumask cm; struct kvm_vcpu *tmp; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; @@ -90,7 +90,6 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run unsigned long funcid = cp->a6; cpumask_clear(&cm); - cpumask_clear(&hm); kvm_for_each_vcpu(i, tmp, vcpu->kvm) { if (hbase != -1UL) { if (tmp->vcpu_id < hbase) @@ -103,17 +102,15 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run cpumask_set_cpu(tmp->cpu, &cm); } - riscv_cpuid_to_hartid_mask(&cm, &hm); - switch (funcid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: - ret = sbi_remote_fence_i(cpumask_bits(&hm)); + ret = sbi_remote_fence_i(&cm); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - ret = sbi_remote_hfence_vvma(cpumask_bits(&hm), cp->a2, cp->a3); + ret = sbi_remote_hfence_vvma(&cm, cp->a2, cp->a3); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - ret = sbi_remote_hfence_vvma_asid(cpumask_bits(&hm), cp->a2, + ret = sbi_remote_hfence_vvma_asid(&cm, cp->a2, cp->a3, cp->a4); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index 4c7e13ec9ccc..07e2de14433a 100644 --- a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -38,7 +38,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, int i, ret = 0; u64 next_cycle; struct kvm_vcpu *rvcpu; - struct cpumask cm, hm; + struct cpumask cm; struct kvm *kvm = vcpu->kvm; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -101,15 +101,12 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, continue; cpumask_set_cpu(rvcpu->cpu, &cm); } - riscv_cpuid_to_hartid_mask(&cm, &hm); if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I) - ret = sbi_remote_fence_i(cpumask_bits(&hm)); + ret = sbi_remote_fence_i(&cm); else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) - ret = sbi_remote_hfence_vvma(cpumask_bits(&hm), - cp->a1, cp->a2); + ret = sbi_remote_hfence_vvma(&cm, cp->a1, cp->a2); else - ret = sbi_remote_hfence_vvma_asid(cpumask_bits(&hm), - cp->a1, cp->a2, cp->a3); + ret = sbi_remote_hfence_vvma_asid(&cm, cp->a1, cp->a2, cp->a3); break; default: ret = -EINVAL; diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 807228f8f409..2fa4f7b1813d 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -67,7 +67,6 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) { unsigned long i; struct kvm_vcpu *v; - struct cpumask hmask; struct kvm_vmid *vmid = &vcpu->kvm->arch.vmid; if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) @@ -102,8 +101,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) * running, we force VM exits on all host CPUs using IPI and * flush all Guest TLBs. */ - riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask); - sbi_remote_hfence_gvma(cpumask_bits(&hmask), 0, 0); + sbi_remote_hfence_gvma(cpu_online_mask, 0, 0); } vmid->vmid = vmid_next; diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S index 63bc691cff91..8c475f4da308 100644 --- a/arch/riscv/lib/uaccess.S +++ b/arch/riscv/lib/uaccess.S @@ -1,15 +1,13 @@ #include <linux/linkage.h> #include <asm-generic/export.h> #include <asm/asm.h> +#include <asm/asm-extable.h> #include <asm/csr.h> .macro fixup op reg addr lbl 100: \op \reg, \addr - .section __ex_table,"a" - .balign RISCV_SZPTR - RISCV_PTR 100b, \lbl - .previous + _asm_extable 100b, \lbl .endm ENTRY(__asm_copy_to_user) @@ -173,6 +171,13 @@ ENTRY(__asm_copy_from_user) csrc CSR_STATUS, t6 li a0, 0 ret + + /* Exception fixup code */ +10: + /* Disable access to user memory */ + csrs CSR_STATUS, t6 + mv a0, t5 + ret ENDPROC(__asm_copy_to_user) ENDPROC(__asm_copy_from_user) EXPORT_SYMBOL(__asm_copy_to_user) @@ -218,19 +223,12 @@ ENTRY(__clear_user) addi a0, a0, 1 bltu a0, a3, 5b j 3b -ENDPROC(__clear_user) -EXPORT_SYMBOL(__clear_user) - .section .fixup,"ax" - .balign 4 - /* Fixup code for __copy_user(10) and __clear_user(11) */ -10: - /* Disable access to user memory */ - csrs CSR_STATUS, t6 - mv a0, t5 - ret + /* Exception fixup code */ 11: + /* Disable access to user memory */ csrs CSR_STATUS, t6 mv a0, a1 ret - .previous +ENDPROC(__clear_user) +EXPORT_SYMBOL(__clear_user) diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 89f81067e09e..6cb7d96ad9c7 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -67,10 +67,7 @@ void flush_icache_mm(struct mm_struct *mm, bool local) */ smp_mb(); } else if (IS_ENABLED(CONFIG_RISCV_SBI)) { - cpumask_t hartid_mask; - - riscv_cpuid_to_hartid_mask(&others, &hartid_mask); - sbi_remote_fence_i(cpumask_bits(&hartid_mask)); + sbi_remote_fence_i(&others); } else { on_each_cpu_mask(&others, ipi_remote_fence_i, NULL, 1); } diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index ea54cc0c9106..7acbfbd14557 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) switch_mm_fast: csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | ((cntx & asid_mask) << SATP_ASID_SHIFT) | - SATP_MODE); + satp_mode); if (need_flush_tlb) local_flush_tlb_all(); @@ -201,7 +201,7 @@ switch_mm_fast: static void set_mm_noasid(struct mm_struct *mm) { /* Switch the page table and blindly nuke entire local TLB */ - csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); local_flush_tlb_all(); } diff --git a/arch/riscv/mm/extable.c b/arch/riscv/mm/extable.c index ddb7d3b99e89..05978f78579f 100644 --- a/arch/riscv/mm/extable.c +++ b/arch/riscv/mm/extable.c @@ -7,27 +7,65 @@ */ +#include <linux/bitfield.h> #include <linux/extable.h> #include <linux/module.h> #include <linux/uaccess.h> +#include <asm/asm-extable.h> +#include <asm/ptrace.h> -#if defined(CONFIG_BPF_JIT) && defined(CONFIG_ARCH_RV64I) -int rv_bpf_fixup_exception(const struct exception_table_entry *ex, struct pt_regs *regs); -#endif +static inline unsigned long +get_ex_fixup(const struct exception_table_entry *ex) +{ + return ((unsigned long)&ex->fixup + ex->fixup); +} + +static bool ex_handler_fixup(const struct exception_table_entry *ex, + struct pt_regs *regs) +{ + regs->epc = get_ex_fixup(ex); + return true; +} + +static inline void regs_set_gpr(struct pt_regs *regs, unsigned int offset, + unsigned long val) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return; + + if (!offset) + *(unsigned long *)((unsigned long)regs + offset) = val; +} + +static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex, + struct pt_regs *regs) +{ + int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + int reg_zero = FIELD_GET(EX_DATA_REG_ZERO, ex->data); + + regs_set_gpr(regs, reg_err, -EFAULT); + regs_set_gpr(regs, reg_zero, 0); + + regs->epc = get_ex_fixup(ex); + return true; +} -int fixup_exception(struct pt_regs *regs) +bool fixup_exception(struct pt_regs *regs) { - const struct exception_table_entry *fixup; + const struct exception_table_entry *ex; - fixup = search_exception_tables(regs->epc); - if (!fixup) - return 0; + ex = search_exception_tables(regs->epc); + if (!ex) + return false; -#if defined(CONFIG_BPF_JIT) && defined(CONFIG_ARCH_RV64I) - if (regs->epc >= BPF_JIT_REGION_START && regs->epc < BPF_JIT_REGION_END) - return rv_bpf_fixup_exception(fixup, regs); -#endif + switch (ex->type) { + case EX_TYPE_FIXUP: + return ex_handler_fixup(ex, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(ex, regs); + case EX_TYPE_UACCESS_ERR_ZERO: + return ex_handler_uaccess_err_zero(ex, regs); + } - regs->epc = fixup->fixup; - return 1; + BUG(); } diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 02a0ee2d3c10..4e9efbe46d5f 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -235,7 +235,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) * only copy the information from the master page table, * nothing more. */ - if (unlikely((addr >= VMALLOC_START) && (addr <= VMALLOC_END))) { + if (unlikely((addr >= VMALLOC_START) && (addr < VMALLOC_END))) { vmalloc_fault(regs, code, addr); return; } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 24b2b8044602..cf4d018b7d66 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -37,13 +37,19 @@ EXPORT_SYMBOL(kernel_map); #define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) #endif +#ifdef CONFIG_64BIT +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39; +#else +u64 satp_mode = SATP_MODE_32; +#endif +EXPORT_SYMBOL(satp_mode); + +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL); +EXPORT_SYMBOL(pgtable_l4_enabled); + phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); -#ifdef CONFIG_XIP_KERNEL -extern char _xiprom[], _exiprom[], __data_loc; -#endif - unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -53,15 +59,6 @@ extern char _start[]; void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; -struct pt_alloc_ops { - pte_t *(*get_pte_virt)(phys_addr_t pa); - phys_addr_t (*alloc_pte)(uintptr_t va); -#ifndef __PAGETABLE_PMD_FOLDED - pmd_t *(*get_pmd_virt)(phys_addr_t pa); - phys_addr_t (*alloc_pmd)(uintptr_t va); -#endif -}; - static phys_addr_t dma32_phys_limit __initdata; static void __init zone_sizes_init(void) @@ -102,10 +99,14 @@ static void __init print_vm_layout(void) (unsigned long)VMALLOC_END); print_mlm("lowmem", (unsigned long)PAGE_OFFSET, (unsigned long)high_memory); -#ifdef CONFIG_64BIT - print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, - (unsigned long)ADDRESS_SPACE_END); + if (IS_ENABLED(CONFIG_64BIT)) { +#ifdef CONFIG_KASAN + print_mlm("kasan", KASAN_SHADOW_START, KASAN_SHADOW_END); #endif + + print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, + (unsigned long)ADDRESS_SPACE_END); + } } #else static void print_vm_layout(void) { } @@ -130,18 +131,8 @@ void __init mem_init(void) print_vm_layout(); } -/* - * The default maximal physical memory size is -PAGE_OFFSET for 32-bit kernel, - * whereas for 64-bit kernel, the end of the virtual address space is occupied - * by the modules/BPF/kernel mappings which reduces the available size of the - * linear mapping. - * Limit the memory size via mem. - */ -#ifdef CONFIG_64BIT -static phys_addr_t memory_limit = -PAGE_OFFSET - SZ_4G; -#else -static phys_addr_t memory_limit = -PAGE_OFFSET; -#endif +/* Limit the memory size via mem. */ +static phys_addr_t memory_limit; static int __init early_mem(char *p) { @@ -162,35 +153,31 @@ early_param("mem", early_mem); static void __init setup_bootmem(void) { phys_addr_t vmlinux_end = __pa_symbol(&_end); - phys_addr_t vmlinux_start = __pa_symbol(&_start); - phys_addr_t __maybe_unused max_mapped_addr; - phys_addr_t phys_ram_end; + phys_addr_t max_mapped_addr; + phys_addr_t phys_ram_end, vmlinux_start; -#ifdef CONFIG_XIP_KERNEL - vmlinux_start = __pa_symbol(&_sdata); -#endif + if (IS_ENABLED(CONFIG_XIP_KERNEL)) + vmlinux_start = __pa_symbol(&_sdata); + else + vmlinux_start = __pa_symbol(&_start); memblock_enforce_memory_limit(memory_limit); /* - * Reserve from the start of the kernel to the end of the kernel - */ -#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX) - /* * Make sure we align the reservation on PMD_SIZE since we will * map the kernel in the linear mapping as read-only: we do not want * any allocation to happen between _end and the next pmd aligned page. */ - vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK; -#endif + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK; + /* + * Reserve from the start of the kernel to the end of the kernel + */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); - phys_ram_end = memblock_end_of_DRAM(); -#ifndef CONFIG_64BIT -#ifndef CONFIG_XIP_KERNEL - phys_ram_base = memblock_start_of_DRAM(); -#endif + if (!IS_ENABLED(CONFIG_XIP_KERNEL)) + phys_ram_base = memblock_start_of_DRAM(); /* * memblock allocator is not aware of the fact that last 4K bytes of * the addressable memory can not be mapped because of IS_ERR_VALUE @@ -200,10 +187,11 @@ static void __init setup_bootmem(void) * address space is occupied by the kernel mapping then this check must * be done as soon as the kernel mapping base address is determined. */ - max_mapped_addr = __pa(~(ulong)0); - if (max_mapped_addr == (phys_ram_end - 1)) - memblock_set_current_limit(max_mapped_addr - 4096); -#endif + if (!IS_ENABLED(CONFIG_64BIT)) { + max_mapped_addr = __pa(~(ulong)0); + if (max_mapped_addr == (phys_ram_end - 1)) + memblock_set_current_limit(max_mapped_addr - 4096); + } min_low_pfn = PFN_UP(phys_ram_base); max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); @@ -229,13 +217,7 @@ static void __init setup_bootmem(void) } #ifdef CONFIG_MMU -static struct pt_alloc_ops _pt_ops __initdata; - -#ifdef CONFIG_XIP_KERNEL -#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) -#else -#define pt_ops _pt_ops -#endif +struct pt_alloc_ops pt_ops __initdata; unsigned long riscv_pfn_base __ro_after_init; EXPORT_SYMBOL(riscv_pfn_base); @@ -245,9 +227,11 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL +#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) #define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) #define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte)) #define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) @@ -333,6 +317,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) #endif /* CONFIG_XIP_KERNEL */ +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); + +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud)) +#define fixmap_pud ((pud_t *)XIP_FIXUP(fixmap_pud)) +#define early_pud ((pud_t *)XIP_FIXUP(early_pud)) +#endif /* CONFIG_XIP_KERNEL */ + static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { /* Before MMU is enabled */ @@ -352,7 +346,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) static phys_addr_t __init alloc_pmd_early(uintptr_t va) { - BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT); return (uintptr_t)early_pmd; } @@ -367,7 +361,8 @@ static phys_addr_t __init alloc_pmd_late(uintptr_t va) unsigned long vaddr; vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr); + BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page(vaddr))); + return __pa(vaddr); } @@ -398,21 +393,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp, create_pte_mapping(ptep, va, pa, sz, prot); } -#define pgd_next_t pmd_t -#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va) -#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) +static pud_t *__init get_pud_virt_early(phys_addr_t pa) +{ + return (pud_t *)((uintptr_t)pa); +} + +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PUD); + return (pud_t *)set_fixmap_offset(FIX_PUD, pa); +} + +static pud_t *__init get_pud_virt_late(phys_addr_t pa) +{ + return (pud_t *)__va(pa); +} + +static phys_addr_t __init alloc_pud_early(uintptr_t va) +{ + /* Only one PUD is available for early mapping */ + BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + + return (uintptr_t)early_pud; +} + +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t alloc_pud_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + BUG_ON(!vaddr); + return __pa(vaddr); +} + +static void __init create_pud_mapping(pud_t *pudp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) +{ + pmd_t *nextp; + phys_addr_t next_phys; + uintptr_t pud_index = pud_index(va); + + if (sz == PUD_SIZE) { + if (pud_val(pudp[pud_index]) == 0) + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot); + return; + } + + if (pud_val(pudp[pud_index]) == 0) { + next_phys = pt_ops.alloc_pmd(va); + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE); + nextp = pt_ops.get_pmd_virt(next_phys); + memset(nextp, 0, PAGE_SIZE); + } else { + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index])); + nextp = pt_ops.get_pmd_virt(next_phys); + } + + create_pmd_mapping(nextp, va, pa, sz, prot); +} + +#define pgd_next_t pud_t +#define alloc_pgd_next(__va) (pgtable_l4_enabled ? \ + pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va)) +#define get_pgd_next_virt(__pa) (pgtable_l4_enabled ? \ + pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa)) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) -#define fixmap_pgd_next fixmap_pmd + (pgtable_l4_enabled ? \ + create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \ + create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot)) +#define fixmap_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd) +#define trampoline_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd) +#define early_dtb_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd) #else #define pgd_next_t pte_t #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) #define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) -#define fixmap_pgd_next fixmap_pte +#define fixmap_pgd_next ((uintptr_t)fixmap_pte) +#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) -#endif +#endif /* __PAGETABLE_PMD_FOLDED */ void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, @@ -451,6 +522,8 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) } #ifdef CONFIG_XIP_KERNEL +extern char _xiprom[], _exiprom[], __data_loc; + /* called from head.S with MMU off */ asmlinkage void __init __copy_data(void) { @@ -499,6 +572,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) } #endif /* CONFIG_STRICT_KERNEL_RWX */ +#ifdef CONFIG_64BIT +static void __init disable_pgtable_l4(void) +{ + pgtable_l4_enabled = false; + kernel_map.page_offset = PAGE_OFFSET_L3; + satp_mode = SATP_MODE_39; +} + +/* + * There is a simple way to determine if 4-level is supported by the + * underlying hardware: establish 1:1 mapping in 4-level page table mode + * then read SATP to see if the configuration was taken into account + * meaning sv48 is supported. + */ +static __init void set_satp_mode(void) +{ + u64 identity_satp, hw_satp; + uintptr_t set_satp_mode_pmd; + + set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; + create_pgd_mapping(early_pg_dir, + set_satp_mode_pmd, (uintptr_t)early_pud, + PGDIR_SIZE, PAGE_TABLE); + create_pud_mapping(early_pud, + set_satp_mode_pmd, (uintptr_t)early_pmd, + PUD_SIZE, PAGE_TABLE); + /* Handle the case where set_satp_mode straddles 2 PMDs */ + create_pmd_mapping(early_pmd, + set_satp_mode_pmd, set_satp_mode_pmd, + PMD_SIZE, PAGE_KERNEL_EXEC); + create_pmd_mapping(early_pmd, + set_satp_mode_pmd + PMD_SIZE, + set_satp_mode_pmd + PMD_SIZE, + PMD_SIZE, PAGE_KERNEL_EXEC); + + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; + + local_flush_tlb_all(); + csr_write(CSR_SATP, identity_satp); + hw_satp = csr_swap(CSR_SATP, 0ULL); + local_flush_tlb_all(); + + if (hw_satp != identity_satp) + disable_pgtable_l4(); + + memset(early_pg_dir, 0, PAGE_SIZE); + memset(early_pud, 0, PAGE_SIZE); + memset(early_pmd, 0, PAGE_SIZE); +} +#endif + /* * setup_vm() is called from head.S with MMU-off. * @@ -563,10 +687,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa, + IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, PGDIR_SIZE, IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL); + if (pgtable_l4_enabled) { + create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, + (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); + } + if (IS_ENABLED(CONFIG_64BIT)) { create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, pa, PMD_SIZE, PAGE_KERNEL); @@ -588,11 +717,64 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) dtb_early_pa = dtb_pa; } +/* + * MMU is not enabled, the page tables are allocated directly using + * early_pmd/pud/p4d and the address returned is the physical one. + */ +void __init pt_ops_set_early(void) +{ + pt_ops.alloc_pte = alloc_pte_early; + pt_ops.get_pte_virt = get_pte_virt_early; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_early; + pt_ops.get_pmd_virt = get_pmd_virt_early; + pt_ops.alloc_pud = alloc_pud_early; + pt_ops.get_pud_virt = get_pud_virt_early; +#endif +} + +/* + * MMU is enabled but page table setup is not complete yet. + * fixmap page table alloc functions must be used as a means to temporarily + * map the allocated physical pages since the linear mapping does not exist yet. + * + * Note that this is called with MMU disabled, hence kernel_mapping_pa_to_va, + * but it will be used as described above. + */ +void __init pt_ops_set_fixmap(void) +{ + pt_ops.alloc_pte = kernel_mapping_pa_to_va((uintptr_t)alloc_pte_fixmap); + pt_ops.get_pte_virt = kernel_mapping_pa_to_va((uintptr_t)get_pte_virt_fixmap); +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); + pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); + pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap); + pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap); +#endif +} + +/* + * MMU is enabled and page table setup is complete, so from now, we can use + * generic page allocation functions to setup page table. + */ +void __init pt_ops_set_late(void) +{ + pt_ops.alloc_pte = alloc_pte_late; + pt_ops.get_pte_virt = get_pte_virt_late; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_late; + pt_ops.get_pmd_virt = get_pmd_virt_late; + pt_ops.alloc_pud = alloc_pud_late; + pt_ops.get_pud_virt = get_pud_virt_late; +#endif +} + asmlinkage void __init setup_vm(uintptr_t dtb_pa) { pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; kernel_map.virt_addr = KERNEL_LINK_ADDR; + kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); #ifdef CONFIG_XIP_KERNEL kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; @@ -607,11 +789,24 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) kernel_map.phys_addr = (uintptr_t)(&_start); kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; #endif + +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) + set_satp_mode(); +#endif + kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; riscv_pfn_base = PFN_DOWN(kernel_map.phys_addr); + /* + * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit + * kernel, whereas for 64-bit kernel, the end of the virtual address + * space is occupied by the modules/BPF/kernel mappings which reduces + * the available size of the linear mapping. + */ + memory_limit = KERN_VIRT_SIZE - (IS_ENABLED(CONFIG_64BIT) ? SZ_4G : 0); + /* Sanity check alignment and size */ BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); BUG_ON((kernel_map.phys_addr % PMD_SIZE) != 0); @@ -624,23 +819,25 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) BUG_ON((kernel_map.virt_addr + kernel_map.size) > ADDRESS_SPACE_END - SZ_4K); #endif - pt_ops.alloc_pte = alloc_pte_early; - pt_ops.get_pte_virt = get_pte_virt_early; -#ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = alloc_pmd_early; - pt_ops.get_pmd_virt = get_pmd_virt_early; -#endif + pt_ops_set_early(); + /* Setup early PGD for fixmap */ create_pgd_mapping(early_pg_dir, FIXADDR_START, - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); #ifndef __PAGETABLE_PMD_FOLDED - /* Setup fixmap PMD */ + /* Setup fixmap PUD and PMD */ + if (pgtable_l4_enabled) + create_pud_mapping(fixmap_pud, FIXADDR_START, + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE); create_pmd_mapping(fixmap_pmd, FIXADDR_START, (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); /* Setup trampoline PGD and PMD */ create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE); + if (pgtable_l4_enabled) + create_pud_mapping(trampoline_pud, kernel_map.virt_addr, + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE); #ifdef CONFIG_XIP_KERNEL create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); @@ -668,7 +865,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap * range can not span multiple pmds. */ - BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); #ifndef __PAGETABLE_PMD_FOLDED @@ -693,6 +890,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN); } #endif + + pt_ops_set_fixmap(); } static void __init setup_vm_final(void) @@ -701,16 +900,6 @@ static void __init setup_vm_final(void) phys_addr_t pa, start, end; u64 i; - /** - * MMU is enabled at this point. But page table setup is not complete yet. - * fixmap page table alloc functions should be used at this point - */ - pt_ops.alloc_pte = alloc_pte_fixmap; - pt_ops.get_pte_virt = get_pte_virt_fixmap; -#ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = alloc_pmd_fixmap; - pt_ops.get_pmd_virt = get_pmd_virt_fixmap; -#endif /* Setup swapper PGD for fixmap */ create_pgd_mapping(swapper_pg_dir, FIXADDR_START, __pa_symbol(fixmap_pgd_next), @@ -735,26 +924,24 @@ static void __init setup_vm_final(void) } } -#ifdef CONFIG_64BIT /* Map the kernel */ - create_kernel_page_table(swapper_pg_dir, false); + if (IS_ENABLED(CONFIG_64BIT)) + create_kernel_page_table(swapper_pg_dir, false); + +#ifdef CONFIG_KASAN + kasan_swapper_init(); #endif /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); clear_fixmap(FIX_PMD); + clear_fixmap(FIX_PUD); /* Move to swapper page table */ - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode); local_flush_tlb_all(); - /* generic page allocation functions must be used to setup page table */ - pt_ops.alloc_pte = alloc_pte_late; - pt_ops.get_pte_virt = get_pte_virt_late; -#ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = alloc_pmd_late; - pt_ops.get_pmd_virt = get_pmd_virt_late; -#endif + pt_ops_set_late(); } #else asmlinkage void __init setup_vm(uintptr_t dtb_pa) @@ -790,12 +977,10 @@ static void __init reserve_crashkernel(void) * since it doesn't make much sense and we have limited memory * resources. */ -#ifdef CONFIG_CRASH_DUMP if (is_kdump_kernel()) { pr_info("crashkernel: ignoring reservation request\n"); return; } -#endif ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); @@ -812,13 +997,22 @@ static void __init reserve_crashkernel(void) /* * Current riscv boot protocol requires 2MB alignment for * RV64 and 4MB alignment for RV32 (hugepage size) + * + * Try to alloc from 32bit addressible physical memory so that + * swiotlb can work on the crash kernel. */ crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, - search_start, search_end); + search_start, + min(search_end, (unsigned long) SZ_4G)); if (crash_base == 0) { - pr_warn("crashkernel: couldn't allocate %lldKB\n", - crash_size >> 10); - return; + /* Try again without restricting region to 32bit addressible memory */ + crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, + search_start, search_end); + if (crash_base == 0) { + pr_warn("crashkernel: couldn't allocate %lldKB\n", + crash_size >> 10); + return; + } } pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 54294f83513d..f61f7ca6fe0f 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -11,45 +11,27 @@ #include <asm/fixmap.h> #include <asm/pgalloc.h> -extern pgd_t early_pg_dir[PTRS_PER_PGD]; -asmlinkage void __init kasan_early_init(void) -{ - uintptr_t i; - pgd_t *pgd = early_pg_dir + pgd_index(KASAN_SHADOW_START); - - BUILD_BUG_ON(KASAN_SHADOW_OFFSET != - KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); - - for (i = 0; i < PTRS_PER_PTE; ++i) - set_pte(kasan_early_shadow_pte + i, - mk_pte(virt_to_page(kasan_early_shadow_page), - PAGE_KERNEL)); - - for (i = 0; i < PTRS_PER_PMD; ++i) - set_pmd(kasan_early_shadow_pmd + i, - pfn_pmd(PFN_DOWN - (__pa((uintptr_t) kasan_early_shadow_pte)), - __pgprot(_PAGE_TABLE))); - - for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END; - i += PGDIR_SIZE, ++pgd) - set_pgd(pgd, - pfn_pgd(PFN_DOWN - (__pa(((uintptr_t) kasan_early_shadow_pmd))), - __pgprot(_PAGE_TABLE))); - - /* init for swapper_pg_dir */ - pgd = pgd_offset_k(KASAN_SHADOW_START); - - for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END; - i += PGDIR_SIZE, ++pgd) - set_pgd(pgd, - pfn_pgd(PFN_DOWN - (__pa(((uintptr_t) kasan_early_shadow_pmd))), - __pgprot(_PAGE_TABLE))); +/* + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57 + * which is right before the kernel. + * + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate + * the page global directory with kasan_early_shadow_pmd. + * + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping + * must be divided as follows: + * - the first PGD entry, although incomplete, is populated with + * kasan_early_shadow_pud/p4d + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d + * - the last PGD entry is shared with the kernel mapping so populated at the + * lower levels pud/p4d + * + * In addition, when shallow populating a kasan region (for example vmalloc), + * this region may also not be aligned on PGDIR size, so we must go down to the + * pud level too. + */ - local_flush_tlb_all(); -} +extern pgd_t early_pg_dir[PTRS_PER_PGD]; static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { @@ -73,15 +55,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); } -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pmd_t *pmdp, *base_pmd; unsigned long next; - base_pmd = (pmd_t *)pgd_page_vaddr(*pgd); - if (base_pmd == lm_alias(kasan_early_shadow_pmd)) + if (pud_none(*pud)) { base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + } else { + base_pmd = (pmd_t *)pud_pgtable(*pud); + if (base_pmd == lm_alias(kasan_early_shadow_pmd)) + base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + } pmdp = base_pmd + pmd_index(vaddr); @@ -105,59 +91,207 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned * it entirely, memblock could allocate a page at a physical address * where KASAN is not populated yet and then we'd get a page fault. */ - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); + set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); +} + +static void __init kasan_populate_pud(pgd_t *pgd, + unsigned long vaddr, unsigned long end, + bool early) +{ + phys_addr_t phys_addr; + pud_t *pudp, *base_pud; + unsigned long next; + + if (early) { + /* + * We can't use pgd_page_vaddr here as it would return a linear + * mapping address but it is not mapped yet, but when populating + * early_pg_dir, we need the physical address and when populating + * swapper_pg_dir, we need the kernel virtual address so use + * pt_ops facility. + */ + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); + } else { + base_pud = (pud_t *)pgd_page_vaddr(*pgd); + if (base_pud == lm_alias(kasan_early_shadow_pud)) + base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + } + + pudp = base_pud + pud_index(vaddr); + + do { + next = pud_addr_end(vaddr, end); + + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { + if (early) { + phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd)); + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } else { + phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); + if (phys_addr) { + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); + continue; + } + } + } + + kasan_populate_pmd(pudp, vaddr, next); + } while (pudp++, vaddr = next, vaddr != end); + + /* + * Wait for the whole PGD to be populated before setting the PGD in + * the page table, otherwise, if we did set the PGD before populating + * it entirely, memblock could allocate a page at a physical address + * where KASAN is not populated yet and then we'd get a page fault. + */ + if (!early) + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); } -static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end) +#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)kasan_early_shadow_pud : \ + (uintptr_t)kasan_early_shadow_pmd) +#define kasan_populate_pgd_next(pgdp, vaddr, next, early) \ + (pgtable_l4_enabled ? \ + kasan_populate_pud(pgdp, vaddr, next, early) : \ + kasan_populate_pmd((pud_t *)pgdp, vaddr, next)) + +static void __init kasan_populate_pgd(pgd_t *pgdp, + unsigned long vaddr, unsigned long end, + bool early) { phys_addr_t phys_addr; - pgd_t *pgdp = pgd_offset_k(vaddr); unsigned long next; do { next = pgd_addr_end(vaddr, end); - /* - * pgdp can't be none since kasan_early_init initialized all KASAN - * shadow region with kasan_early_shadow_pmd: if this is stillthe case, - * that means we can try to allocate a hugepage as a replacement. - */ - if (pgd_page_vaddr(*pgdp) == (unsigned long)lm_alias(kasan_early_shadow_pmd) && - IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { - phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); - if (phys_addr) { - set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { + if (early) { + phys_addr = __pa((uintptr_t)kasan_early_shadow_pgd_next); + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_TABLE)); continue; + } else if (pgd_page_vaddr(*pgdp) == + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)) { + /* + * pgdp can't be none since kasan_early_init + * initialized all KASAN shadow region with + * kasan_early_shadow_pud: if this is still the + * case, that means we can try to allocate a + * hugepage as a replacement. + */ + phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); + if (phys_addr) { + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + continue; + } } } - kasan_populate_pmd(pgdp, vaddr, next); + kasan_populate_pgd_next(pgdp, vaddr, next, early); } while (pgdp++, vaddr = next, vaddr != end); } +asmlinkage void __init kasan_early_init(void) +{ + uintptr_t i; + + BUILD_BUG_ON(KASAN_SHADOW_OFFSET != + KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); + + for (i = 0; i < PTRS_PER_PTE; ++i) + set_pte(kasan_early_shadow_pte + i, + mk_pte(virt_to_page(kasan_early_shadow_page), + PAGE_KERNEL)); + + for (i = 0; i < PTRS_PER_PMD; ++i) + set_pmd(kasan_early_shadow_pmd + i, + pfn_pmd(PFN_DOWN + (__pa((uintptr_t)kasan_early_shadow_pte)), + PAGE_TABLE)); + + if (pgtable_l4_enabled) { + for (i = 0; i < PTRS_PER_PUD; ++i) + set_pud(kasan_early_shadow_pud + i, + pfn_pud(PFN_DOWN + (__pa(((uintptr_t)kasan_early_shadow_pmd))), + PAGE_TABLE)); + } + + kasan_populate_pgd(early_pg_dir + pgd_index(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END, true); + + local_flush_tlb_all(); +} + +void __init kasan_swapper_init(void) +{ + kasan_populate_pgd(pgd_offset_k(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END, true); + + local_flush_tlb_all(); +} + static void __init kasan_populate(void *start, void *end) { unsigned long vaddr = (unsigned long)start & PAGE_MASK; unsigned long vend = PAGE_ALIGN((unsigned long)end); - kasan_populate_pgd(vaddr, vend); + kasan_populate_pgd(pgd_offset_k(vaddr), vaddr, vend, false); local_flush_tlb_all(); memset(start, KASAN_SHADOW_INIT, end - start); } +static void __init kasan_shallow_populate_pud(pgd_t *pgdp, + unsigned long vaddr, unsigned long end, + bool kasan_populate) +{ + unsigned long next; + pud_t *pudp, *base_pud; + pmd_t *base_pmd; + bool is_kasan_pmd; + + base_pud = (pud_t *)pgd_page_vaddr(*pgdp); + pudp = base_pud + pud_index(vaddr); + + if (kasan_populate) + memcpy(base_pud, (void *)kasan_early_shadow_pgd_next, + sizeof(pud_t) * PTRS_PER_PUD); + + do { + next = pud_addr_end(vaddr, end); + is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd)); + + if (is_kasan_pmd) { + base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); + } + } while (pudp++, vaddr = next, vaddr != end); +} + static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) { unsigned long next; void *p; pgd_t *pgd_k = pgd_offset_k(vaddr); + bool is_kasan_pgd_next; do { next = pgd_addr_end(vaddr, end); - if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { + is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) == + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)); + + if (is_kasan_pgd_next) { p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } + + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) + continue; + + kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next); } while (pgd_k++, vaddr = next, vaddr != end); } diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 64f8201237c2..37ed760d007c 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -32,7 +32,6 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, unsigned long size, unsigned long stride) { struct cpumask *cmask = mm_cpumask(mm); - struct cpumask hmask; unsigned int cpuid; bool broadcast; @@ -46,9 +45,7 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, unsigned long asid = atomic_long_read(&mm->context.id); if (broadcast) { - riscv_cpuid_to_hartid_mask(cmask, &hmask); - sbi_remote_sfence_vma_asid(cpumask_bits(&hmask), - start, size, asid); + sbi_remote_sfence_vma_asid(cmask, start, size, asid); } else if (size <= stride) { local_flush_tlb_page_asid(start, asid); } else { @@ -56,9 +53,7 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, } } else { if (broadcast) { - riscv_cpuid_to_hartid_mask(cmask, &hmask); - sbi_remote_sfence_vma(cpumask_bits(&hmask), - start, size); + sbi_remote_sfence_vma(cmask, start, size); } else if (size <= stride) { local_flush_tlb_page(start); } else { diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 603630b6f3c5..0bcda99d1d68 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -458,10 +458,8 @@ static int emit_call(bool fixed, u64 addr, struct rv_jit_context *ctx) #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) -int rv_bpf_fixup_exception(const struct exception_table_entry *ex, - struct pt_regs *regs); -int rv_bpf_fixup_exception(const struct exception_table_entry *ex, - struct pt_regs *regs) +bool ex_handler_bpf(const struct exception_table_entry *ex, + struct pt_regs *regs) { off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); @@ -469,7 +467,7 @@ int rv_bpf_fixup_exception(const struct exception_table_entry *ex, *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0; regs->epc = (unsigned long)&ex->fixup - offset; - return 1; + return true; } /* For accesses to BTF pointers, add an entry to the exception table */ @@ -499,7 +497,7 @@ static int add_exception_handler(const struct bpf_insn *insn, offset = pc - (long)&ex->insn; if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) return -ERANGE; - ex->insn = pc; + ex->insn = offset; /* * Since the extable follows the program, the fixup offset is always @@ -515,6 +513,7 @@ static int add_exception_handler(const struct bpf_insn *insn, ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, offset) | FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + ex->type = EX_TYPE_BPF; ctx->nexentries++; return 0; diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile index 3b860061e84d..d04e0e7de0b3 100644 --- a/arch/s390/boot/compressed/Makefile +++ b/arch/s390/boot/compressed/Makefile @@ -58,8 +58,6 @@ OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -vmlinux.bin.all-y := $(obj)/vmlinux.bin - suffix-$(CONFIG_KERNEL_GZIP) := .gz suffix-$(CONFIG_KERNEL_BZIP2) := .bz2 suffix-$(CONFIG_KERNEL_LZ4) := .lz4 @@ -68,20 +66,20 @@ suffix-$(CONFIG_KERNEL_LZO) := .lzo suffix-$(CONFIG_KERNEL_XZ) := .xz suffix-$(CONFIG_KERNEL_ZSTD) := .zst -$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lz4) -$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzma) -$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzo) -$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE - $(call if_changed,xzkern) -$(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE - $(call if_changed,zstd22) +$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE + $(call if_changed,bzip2_with_size) +$(obj)/vmlinux.bin.lz4: $(obj)/vmlinux.bin FORCE + $(call if_changed,lz4_with_size) +$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzma_with_size) +$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzo_with_size) +$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE + $(call if_changed,xzkern_with_size) +$(obj)/vmlinux.bin.zst: $(obj)/vmlinux.bin FORCE + $(call if_changed,zstd22_with_size) OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed $(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 0d90cbeb89b4..e3f12db46cfc 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -109,7 +109,9 @@ struct hws_basic_entry { unsigned int AS:2; /* 29-30 PSW address-space control */ unsigned int I:1; /* 31 entry valid or invalid */ unsigned int CL:2; /* 32-33 Configuration Level */ - unsigned int:14; + unsigned int H:1; /* 34 Host Indicator */ + unsigned int LS:1; /* 35 Limited Sampling */ + unsigned int:12; unsigned int prim_asn:16; /* primary ASN */ unsigned long long ia; /* Instruction Address */ unsigned long long gpp; /* Guest Program Parameter */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index ce550d06abc3..147cb3534ce4 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -49,51 +49,85 @@ int __get_user_bad(void) __attribute__((noreturn)); #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES -#define __put_get_user_asm(to, from, size, insn) \ -({ \ - int __rc; \ - \ - asm volatile( \ - insn " 0,%[spec]\n" \ - "0: mvcos %[_to],%[_from],%[_size]\n" \ - "1: xr %[rc],%[rc]\n" \ - "2:\n" \ - ".pushsection .fixup, \"ax\"\n" \ - "3: lhi %[rc],%[retval]\n" \ - " jg 2b\n" \ - ".popsection\n" \ - EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ - : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ - : [_size] "d" (size), [_from] "Q" (*(from)), \ - [retval] "K" (-EFAULT), [spec] "K" (0x81UL) \ - : "cc", "0"); \ - __rc; \ +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + +#define __put_get_user_asm(to, from, size, oac_spec) \ +({ \ + int __rc; \ + \ + asm volatile( \ + " lr 0,%[spec]\n" \ + "0: mvcos %[_to],%[_from],%[_size]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: lhi %[rc],%[retval]\n" \ + " jg 2b\n" \ + ".popsection\n" \ + EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ + : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ + : [_size] "d" (size), [_from] "Q" (*(from)), \ + [retval] "K" (-EFAULT), [spec] "d" (oac_spec.val) \ + : "cc", "0"); \ + __rc; \ }) +#define __put_user_asm(to, from, size) \ + __put_get_user_asm(to, from, size, ((union oac) { \ + .oac1.as = PSW_BITS_AS_SECONDARY, \ + .oac1.a = 1 \ + })) + +#define __get_user_asm(to, from, size) \ + __put_get_user_asm(to, from, size, ((union oac) { \ + .oac2.as = PSW_BITS_AS_SECONDARY, \ + .oac2.a = 1 \ + })) \ + static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) { int rc; switch (size) { case 1: - rc = __put_get_user_asm((unsigned char __user *)ptr, - (unsigned char *)x, - size, "llilh"); + rc = __put_user_asm((unsigned char __user *)ptr, + (unsigned char *)x, + size); break; case 2: - rc = __put_get_user_asm((unsigned short __user *)ptr, - (unsigned short *)x, - size, "llilh"); + rc = __put_user_asm((unsigned short __user *)ptr, + (unsigned short *)x, + size); break; case 4: - rc = __put_get_user_asm((unsigned int __user *)ptr, - (unsigned int *)x, - size, "llilh"); + rc = __put_user_asm((unsigned int __user *)ptr, + (unsigned int *)x, + size); break; case 8: - rc = __put_get_user_asm((unsigned long __user *)ptr, - (unsigned long *)x, - size, "llilh"); + rc = __put_user_asm((unsigned long __user *)ptr, + (unsigned long *)x, + size); break; default: __put_user_bad(); @@ -108,24 +142,24 @@ static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsign switch (size) { case 1: - rc = __put_get_user_asm((unsigned char *)x, - (unsigned char __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned char *)x, + (unsigned char __user *)ptr, + size); break; case 2: - rc = __put_get_user_asm((unsigned short *)x, - (unsigned short __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned short *)x, + (unsigned short __user *)ptr, + size); break; case 4: - rc = __put_get_user_asm((unsigned int *)x, - (unsigned int __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned int *)x, + (unsigned int __user *)ptr, + size); break; case 8: - rc = __put_get_user_asm((unsigned long *)x, - (unsigned long __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned long *)x, + (unsigned long __user *)ptr, + size); break; default: __get_user_bad(); diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c index 30f0242de4a5..8ee48672233f 100644 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ b/arch/s390/kernel/perf_cpum_cf_common.c @@ -178,7 +178,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, case CPUMF_CTR_SET_CRYPTO: if (info->csvn >= 1 && info->csvn <= 5) ctrset_size = 16; - else if (info->csvn == 6) + else if (info->csvn == 6 || info->csvn == 7) ctrset_size = 20; break; case CPUMF_CTR_SET_EXT: @@ -188,7 +188,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, ctrset_size = 48; else if (info->csvn >= 3 && info->csvn <= 5) ctrset_size = 128; - else if (info->csvn == 6) + else if (info->csvn == 6 || info->csvn == 7) ctrset_size = 160; break; case CPUMF_CTR_SET_MT_DIAG: diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 37265f551a11..52c1fe23b823 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -344,7 +344,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { +static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), @@ -715,8 +715,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 1 ... 5: csvn = cpumcf_svn_12345_pmu_event_attr; break; - case 6: - csvn = cpumcf_svn_6_pmu_event_attr; + case 6 ... 7: + csvn = cpumcf_svn_67_pmu_event_attr; break; default: csvn = none; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index db62def4ef28..332a49965130 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1179,7 +1179,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, sample = (struct hws_basic_entry *) *sdbt; while ((unsigned long *) sample < (unsigned long *) te) { /* Check for an empty sample */ - if (!sample->def) + if (!sample->def || sample->LS) break; /* Update perf event period */ diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index a596e69d3c47..8a5d21461889 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -62,10 +62,14 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr unsigned long size) { unsigned long tmp1, tmp2; + union oac spec = { + .oac2.as = PSW_BITS_AS_SECONDARY, + .oac2.a = 1, + }; tmp1 = -4096UL; asm volatile( - " lghi 0,%[spec]\n" + " lr 0,%[spec]\n" "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n" "6: jz 4f\n" "1: algr %0,%3\n" @@ -84,7 +88,7 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : [spec] "K" (0x81UL) + : [spec] "d" (spec.val) : "cc", "memory", "0"); return size; } @@ -135,10 +139,14 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, unsigned long size) { unsigned long tmp1, tmp2; + union oac spec = { + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.a = 1, + }; tmp1 = -4096UL; asm volatile( - " llilh 0,%[spec]\n" + " lr 0,%[spec]\n" "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" "6: jz 4f\n" "1: algr %0,%3\n" @@ -157,7 +165,7 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : [spec] "K" (0x81UL) + : [spec] "d" (spec.val) : "cc", "memory", "0"); return size; } @@ -207,10 +215,14 @@ EXPORT_SYMBOL(raw_copy_to_user); static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) { unsigned long tmp1, tmp2; + union oac spec = { + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.a = 1, + }; tmp1 = -4096UL; asm volatile( - " llilh 0,%[spec]\n" + " lr 0,%[spec]\n" "0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n" " jz 4f\n" "1: algr %0,%2\n" @@ -228,7 +240,7 @@ static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) - : "a" (empty_zero_page), [spec] "K" (0x81UL) + : "a" (empty_zero_page), [spec] "d" (spec.val) : "cc", "memory", "0"); return size; } diff --git a/arch/sh/boot/Makefile b/arch/sh/boot/Makefile index 5c123f5b2797..1f5d2df3c7e0 100644 --- a/arch/sh/boot/Makefile +++ b/arch/sh/boot/Makefile @@ -19,12 +19,12 @@ CONFIG_ZERO_PAGE_OFFSET ?= 0x00001000 CONFIG_ENTRY_OFFSET ?= 0x00001000 CONFIG_PHYSICAL_START ?= $(CONFIG_MEMORY_START) -suffix-y := bin -suffix-$(CONFIG_KERNEL_GZIP) := gz -suffix-$(CONFIG_KERNEL_BZIP2) := bz2 -suffix-$(CONFIG_KERNEL_LZMA) := lzma -suffix-$(CONFIG_KERNEL_XZ) := xz -suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix_y := bin +suffix_$(CONFIG_KERNEL_GZIP) := gz +suffix_$(CONFIG_KERNEL_BZIP2) := bz2 +suffix_$(CONFIG_KERNEL_LZMA) := lzma +suffix_$(CONFIG_KERNEL_XZ) := xz +suffix_$(CONFIG_KERNEL_LZO) := lzo targets := zImage vmlinux.srec romImage uImage uImage.srec uImage.gz \ uImage.bz2 uImage.lzma uImage.xz uImage.lzo uImage.bin \ @@ -106,10 +106,10 @@ OBJCOPYFLAGS_uImage.srec := -I binary -O srec $(obj)/uImage.srec: $(obj)/uImage FORCE $(call if_changed,objcopy) -$(obj)/uImage: $(obj)/uImage.$(suffix-y) +$(obj)/uImage: $(obj)/uImage.$(suffix_y) @ln -sf $(notdir $<) $@ @echo ' Image $@ is ready' export CONFIG_PAGE_OFFSET CONFIG_MEMORY_START CONFIG_BOOT_LINK_OFFSET \ CONFIG_PHYSICAL_START CONFIG_ZERO_PAGE_OFFSET CONFIG_ENTRY_OFFSET \ - KERNEL_MEMORY suffix-y + KERNEL_MEMORY suffix_y diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile index cf3174df7859..591125c42d49 100644 --- a/arch/sh/boot/compressed/Makefile +++ b/arch/sh/boot/compressed/Makefile @@ -47,22 +47,20 @@ $(obj)/vmlinux: $(addprefix $(obj)/, $(OBJECTS)) FORCE $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -vmlinux.bin.all-y := $(obj)/vmlinux.bin - -$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzma) -$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE - $(call if_changed,xzkern) -$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzo) +$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE + $(call if_changed,bzip2_with_size) +$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzma_with_size) +$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE + $(call if_changed,xzkern_with_size) +$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzo_with_size) OBJCOPYFLAGS += -R .empty_zero_page LDFLAGS_piggy.o := -r --format binary --oformat $(ld-bfd) -T -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE $(call if_changed,ld) diff --git a/arch/sh/boot/dts/Makefile b/arch/sh/boot/dts/Makefile index c17d65b82abe..4a6dec9714a9 100644 --- a/arch/sh/boot/dts/Makefile +++ b/arch/sh/boot/dts/Makefile @@ -1,4 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -ifneq ($(CONFIG_BUILTIN_DTB_SOURCE),"") -obj-$(CONFIG_USE_BUILTIN_DTB) += $(patsubst "%",%,$(CONFIG_BUILTIN_DTB_SOURCE)).dtb.o -endif +obj-$(CONFIG_USE_BUILTIN_DTB) += $(addsuffix .dtb.o, $(CONFIG_BUILTIN_DTB_SOURCE)) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 66fc08646be5..1cab1b284f1a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -97,6 +97,9 @@ config SPARC64 select PCI_DOMAINS if PCI select ARCH_HAS_GIGANTIC_PAGE select HAVE_SOFTIRQ_ON_OWN_STACK + select HAVE_SETUP_PER_CPU_AREA + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK config ARCH_PROC_KCORE_TEXT def_bool y @@ -123,15 +126,6 @@ config AUDIT_ARCH bool default y -config HAVE_SETUP_PER_CPU_AREA - def_bool y if SPARC64 - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y if SPARC64 - -config NEED_PER_CPU_PAGE_FIRST_CHUNK - def_bool y if SPARC64 - config MMU bool default y diff --git a/arch/sparc/kernel/led.c b/arch/sparc/kernel/led.c index 3a66e62eb2a0..ab657b359789 100644 --- a/arch/sparc/kernel/led.c +++ b/arch/sparc/kernel/led.c @@ -114,18 +114,16 @@ static const struct proc_ops led_proc_ops = { }; #endif -static struct proc_dir_entry *led; - #define LED_VERSION "0.1" static int __init led_init(void) { timer_setup(&led_blink_timer, led_blink, 0); - led = proc_create("led", 0, NULL, &led_proc_ops); - if (!led) +#ifdef CONFIG_PROC_FS + if (!proc_create("led", 0, NULL, &led_proc_ops)) return -ENOMEM; - +#endif printk(KERN_INFO "led: version %s, Lars Kotthoff <metalhead@metalhead.ws>\n", LED_VERSION); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index b98a7bbe6728..a1f78e9ddaf3 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1526,50 +1526,6 @@ void smp_send_stop(void) smp_call_function(stop_this_cpu, NULL, 0); } -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. - */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, - size_t align) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = cpu_to_node(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -static void __init pcpu_free_bootmem(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { if (cpu_to_node(from) == cpu_to_node(to)) @@ -1578,57 +1534,9 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static void __init pcpu_populate_pte(unsigned long addr) +static int __init pcpu_cpu_to_node(int cpu) { - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - if (pgd_none(*pgd)) { - pud_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pgd_populate(&init_mm, pgd, new); - } - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - p4d_populate(&init_mm, p4d, new); - } - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pud_populate(&init_mm, pud, new); - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - pte_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pmd_populate_kernel(&init_mm, pmd, new); - } - - return; - -err_alloc: - panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + return cpu_to_node(cpu); } void __init setup_per_cpu_areas(void) @@ -1641,8 +1549,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, 4 << 20, pcpu_cpu_distance, - pcpu_alloc_bootmem, - pcpu_free_bootmem); + pcpu_cpu_to_node); if (rc) pr_warn("PERCPU: %s allocator failed (%d), " "falling back to page size\n", @@ -1650,9 +1557,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, - pcpu_alloc_bootmem, - pcpu_free_bootmem, - pcpu_populate_pte); + pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index 0ab58016db22..5c092a9153ea 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -616,7 +616,7 @@ static void um_pci_virtio_remove(struct virtio_device *vdev) int i; /* Stop all virtqueues */ - vdev->config->reset(vdev); + virtio_reset_device(vdev); vdev->config->del_vqs(vdev); device_set_wakeup_enable(&vdev->dev, false); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 407533c835fe..6fddb63271d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -78,7 +78,7 @@ config X86 select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_KCOV if X86_64 && STACK_VALIDATION + select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE @@ -240,6 +240,7 @@ config X86 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API + select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR select HAVE_STACK_VALIDATION if X86_64 @@ -253,6 +254,8 @@ config X86 select HAVE_GENERIC_VDSO select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK select NEED_SG_DMA_LENGTH select PCI_DOMAINS if PCI select PCI_LOCKLESS_CONFIG if PCI @@ -333,15 +336,6 @@ config ARCH_HAS_CPU_RELAX config ARCH_HAS_FILTER_PGPROT def_bool y -config HAVE_SETUP_PER_CPU_AREA - def_bool y - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y - -config NEED_PER_CPU_PAGE_FIRST_CHUNK - def_bool y - config ARCH_HIBERNATION_POSSIBLE def_bool y @@ -1575,6 +1569,7 @@ config NUMA depends on SMP depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP) default y if X86_BIGSMP + select USE_PERCPU_NUMA_NODE_ID help Enable NUMA (Non-Uniform Memory Access) support. @@ -2450,10 +2445,6 @@ config ARCH_HAS_ADD_PAGES config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE def_bool y -config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e11813646051..6115274fe10f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -126,17 +126,17 @@ vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE - $(call if_changed,bzip2) + $(call if_changed,bzip2_with_size) $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzma) + $(call if_changed,lzma_with_size) $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE - $(call if_changed,xzkern) + $(call if_changed,xzkern_with_size) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzo) + $(call if_changed,lzo_with_size) $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lz4) + $(call if_changed,lz4_with_size) $(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE - $(call if_changed,zstd22) + $(call if_changed,zstd22_with_size) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index f658bb4dbb74..631d5040b31e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush) KVM_X86_OP_NULL(tlb_remote_flush_with_range) KVM_X86_OP(tlb_flush_gva) KVM_X86_OP(tlb_flush_guest) +KVM_X86_OP(vcpu_pre_run) KVM_X86_OP(run) KVM_X86_OP_NULL(handle_exit) KVM_X86_OP_NULL(skip_emulated_instruction) @@ -98,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff) KVM_X86_OP_NULL(request_immediate_exit) KVM_X86_OP(sched_in) KVM_X86_OP_NULL(update_cpu_dirty_logging) -KVM_X86_OP_NULL(pre_block) -KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) KVM_X86_OP_NULL(update_pi_irte) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0677b9ea01c9..1384517d7709 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1381,6 +1381,7 @@ struct kvm_x86_ops { */ void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); @@ -1454,18 +1455,6 @@ struct kvm_x86_ops { const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; - /* - * Architecture specific hooks for vCPU blocking due to - * HLT instruction. - * Returns for .pre_block(): - * - 0 means continue to block the vCPU. - * - 1 means we cannot block the vCPU since some event - * happens during this period, such as, 'ON' bit in - * posted-interrupts descriptor is set. - */ - int (*pre_block)(struct kvm_vcpu *vcpu); - void (*post_block)(struct kvm_vcpu *vcpu); - void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index fd2d3ab38ebb..dc7da08bc700 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -515,6 +515,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = { .stolen_size = gen9_stolen_size, }; +/* Intel integrated GPUs for which we need to reserve "stolen memory" */ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_I830_IDS(&i830_early_ops), INTEL_I845G_IDS(&i845_early_ops), @@ -592,6 +593,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func) u16 device; int i; + /* + * Reserve "stolen memory" for an integrated GPU. If we've already + * found one, there's nothing to do for other (discrete) GPUs. + */ + if (resource_size(&intel_graphics_stolen_res)) + return; + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { @@ -704,7 +712,7 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, - QFLAG_APPLY_ONCE, intel_graphics_quirks }, + 0, intel_graphics_quirks }, /* * HPET on the current version of the Baytrail platform has accuracy * problems: it will halt in deep idle state - so we disable it. diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 882213df3713..71f336425e58 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1435,8 +1435,12 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) hpet_rtc_timer_reinit(); memset(&curr_time, 0, sizeof(struct rtc_time)); - if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - mc146818_get_time(&curr_time); + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) { + if (unlikely(mc146818_get_time(&curr_time) < 0)) { + pr_err_ratelimited("unable to read current time from RTC\n"); + return IRQ_HANDLED; + } + } if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 9b9fb7882c20..9ae64f9af956 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/dmi.h> #include <linux/ioport.h> #include <asm/e820/api.h> @@ -23,11 +24,31 @@ static void resource_clip(struct resource *res, resource_size_t start, res->start = end + 1; } +/* + * Some BIOS-es contain a bug where they add addresses which map to + * system RAM in the PCI host bridge window returned by the ACPI _CRS + * method, see commit 4dc2287c1805 ("x86: avoid E820 regions when + * allocating address space"). To avoid this Linux by default excludes + * E820 reservations when allocating addresses since 2010. + * In 2019 some systems have shown-up with E820 reservations which cover + * the entire _CRS returned PCI host bridge window, causing all attempts + * to assign memory to PCI BARs to fail if Linux uses E820 reservations. + * + * Ideally Linux would fully stop using E820 reservations, but then + * the old systems this was added for will regress. + * Instead keep the old behavior for old systems, while ignoring the + * E820 reservations for any systems from now on. + */ static void remove_e820_regions(struct resource *avail) { - int i; + int i, year = dmi_get_bios_year(); struct e820_entry *entry; + if (year >= 2018) + return; + + pr_info_once("PCI: Removing E820 reservations from host bridge windows\n"); + for (i = 0; i < e820_table->nr_entries; i++) { entry = &e820_table->entries[i]; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7b65275544b2..49325caa7307 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -84,60 +84,6 @@ static bool __init pcpu_need_numa(void) } #endif -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. - */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, - unsigned long align) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = early_cpu_to_node(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, - node); - - pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", - cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -/* - * Helpers for first chunk memory allocation - */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) -{ - return pcpu_alloc_bootmem(cpu, size, align); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { #ifdef CONFIG_NUMA @@ -150,7 +96,12 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) #endif } -static void __init pcpup_populate_pte(unsigned long addr) +static int __init pcpu_cpu_to_node(int cpu) +{ + return early_cpu_to_node(cpu); +} + +void __init pcpu_populate_pte(unsigned long addr) { populate_extra_pte(addr); } @@ -205,15 +156,14 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_fc_alloc, pcpu_fc_free); + pcpu_cpu_to_node); if (rc < 0) pr_warn("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpup_populate_pte); + pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c55e57b30e81..3902c28fb6cb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } +/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ +static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, + int nent) +{ + struct kvm_cpuid_entry2 *orig; + int i; + + if (nent != vcpu->arch.cpuid_nent) + return -EINVAL; + + for (i = 0; i < nent; i++) { + orig = &vcpu->arch.cpuid_entries[i]; + if (e2[i].function != orig->function || + e2[i].index != orig->index || + e2[i].eax != orig->eax || e2[i].ebx != orig->ebx || + e2[i].ecx != orig->ecx || e2[i].edx != orig->edx) + return -EINVAL; + } + + return 0; +} + static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) { u32 function; @@ -145,14 +167,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) } } -static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, + struct kvm_cpuid_entry2 *entries, int nent) { u32 base = vcpu->arch.kvm_cpuid_base; if (!base) return NULL; - return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); +} + +static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +{ + return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) @@ -167,11 +196,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) vcpu->arch.pv_cpuid.features = best->eax; } -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, + int nent) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, 0); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -182,33 +212,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); } - best = kvm_find_cpuid_entry(vcpu, 7, 0); + best = cpuid_entry2_find(entries, nent, 7, 0); if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) cpuid_entry_change(best, X86_FEATURE_OSPKE, kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + best = cpuid_entry2_find(entries, nent, 0xD, 0); if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - best = kvm_find_cpuid_entry(vcpu, 0xD, 1); + best = cpuid_entry2_find(entries, nent, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = kvm_find_kvm_cpuid_features(vcpu); + best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, 0); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } } + +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +{ + __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); +} EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) @@ -298,6 +333,22 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, { int r; + __kvm_update_cpuid_runtime(vcpu, e2, nent); + + /* + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page + * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with + * the core vCPU model on the fly. It would've been better to forbid any + * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately + * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do + * KVM_SET_CPUID{,2} again. To support this legacy behavior, check + * whether the supplied CPUID data is equal to what's already set. + */ + if (vcpu->arch.last_vmentry_cpu != -1) + return kvm_cpuid_check_equal(vcpu, e2, nent); + r = kvm_check_cpuid(vcpu, e2, nent); if (r) return r; @@ -307,7 +358,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_nent = nent; kvm_update_kvm_cpuid_base(vcpu); - kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); return 0; @@ -795,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) perf_get_x86_pmu_capability(&cap); /* - * Only support guest architectural pmu on a host - * with architectural pmu. + * The guest architecture pmu is only supported if the architecture + * pmu exists on the host and the module parameters allow it. */ - if (!cap.version) + if (!cap.version || !enable_pmu) memset(&cap, 0, sizeof(cap)); eax.split.version_id = min(cap.version, 2); @@ -886,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) --array->nent; continue; } + + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + entry->ecx &= ~BIT_ULL(2); entry->edx = 0; } break; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c5028e6b0f96..baca9fa37a91 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { restart_apic_timer(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) { @@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) start_sw_timer(apic); preempt_enable(); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1d275e9d76b5..593093b52395 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) continue; flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } @@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } /* - * We can flush all the TLBs out of the mmu lock without TLB - * corruption since we just change the spte from writable to - * readonly so that we only need to care the case of changing - * spte from present to present (changing the spte from present - * to nonpresent will flush all the TLBs immediately), in other - * words, the only case we care is mmu_spte_update() where we - * have checked Host-writable | MMU-writable instead of - * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK - * anymore. + * Flush TLBs if any SPTEs had to be write-protected to ensure that + * guest writes are reflected in the dirty bitmap before the memslot + * update completes, i.e. before enabling dirty logging is visible to + * userspace. + * + * Perform the TLB flush outside the mmu_lock to reduce the amount of + * time the lock is held. However, this does mean that another CPU can + * now grab mmu_lock and encounter a write-protected SPTE while CPUs + * still have a writable mapping for the associated GFN in their TLB. + * + * This is safe but requires KVM to be careful when making decisions + * based on the write-protection status of an SPTE. Specifically, KVM + * also write-protects SPTEs to monitor changes to guest page tables + * during shadow paging, and must guarantee no CPUs can write to those + * page before the lock is dropped. As mentioned in the previous + * paragraph, a write-protected SPTE is no guarantee that CPU cannot + * perform writes. So to determine if a TLB flush is truly required, KVM + * will clear a separate software-only bit (MMU-writable) and skip the + * flush if-and-only-if this bit was already clear. + * + * See DEFAULT_SPTE_MMU_WRITEABLE for more details. */ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 351b04ad62a1..73cfe62fdad1 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -216,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~shadow_host_writable_mask; + new_spte &= ~shadow_mmu_writable_mask; new_spte = mark_spte_for_access_track(new_spte); diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index a4af2a42695c..be6a007a4af3 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) -/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ -#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) -#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) - /* * The mask/shift to use for saving the original R/X bits when marking the PTE * as not-present for access tracking purposes. We do not save the W bit as the @@ -79,6 +75,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); /* + * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits + * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs + * that map guest pages in read-only memslots and read-only VMAs. + * + * Invariants: + * - If Host-writable is clear, PT_WRITABLE_MASK must be clear. + * + * + * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU + * allows writes to the guest page mapped by the SPTE. This bit is cleared when + * the guest page mapped by the SPTE contains a page table that is being + * monitored for shadow paging. In this case the SPTE can only be made writable + * by unsyncing the shadow page under the mmu_lock. + * + * Invariants: + * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear. + * - If MMU-writable is set, Host-writable must be set. + * + * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared + * to track writes for dirty logging. For such SPTEs, KVM will locklessly set + * PT_WRITABLE_MASK upon the next write from the guest and record the write in + * the dirty log (see fast_page_fault()). + */ + +/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ +#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) + +/* * Low ignored bits are at a premium for EPT, use high ignored bits, taking care * to not overlap the A/D type mask or the saved access bits of access-tracked * SPTEs when A/D bits are disabled. @@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, static inline bool spte_can_locklessly_be_made_writable(u64 spte) { - return (spte & shadow_host_writable_mask) && - (spte & shadow_mmu_writable_mask); + if (spte & shadow_mmu_writable_mask) { + WARN_ON_ONCE(!(spte & shadow_host_writable_mask)); + return true; + } + + WARN_ON_ONCE(spte & PT_WRITABLE_MASK); + return false; } static inline u64 get_mmio_spte_generation(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b1bc816b7c3..bc9e3553fba2 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - if (!is_writable_pte(iter.old_spte)) - break; - new_spte = iter.old_spte & ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); + if (new_spte == iter.old_spte) + break; + tdp_mmu_set_spte(kvm, &iter, new_spte); spte_set = true; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 261b39cbef6e..f614f95acc6b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -13,6 +13,8 @@ #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <linux/bsearch.h> +#include <linux/sort.h> #include <asm/perf_event.h> #include "x86.h" #include "cpuid.h" @@ -109,6 +111,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .config = config, }; + if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) + return; + attr.sample_period = get_sample_period(pmc, pmc->counter); if (in_tx) @@ -169,12 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } +static int cmp_u64(const void *a, const void *b) +{ + return *(__u64 *)a - *(__u64 *)b; +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; - int i; bool allow_event = true; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) @@ -189,16 +198,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (filter) { - for (i = 0; i < filter->nevents; i++) - if (filter->events[i] == - (eventsel & AMD64_RAW_EVENT_MASK_NB)) - break; - if (filter->action == KVM_PMU_EVENT_ALLOW && - i == filter->nevents) - allow_event = false; - if (filter->action == KVM_PMU_EVENT_DENY && - i < filter->nevents) - allow_event = false; + __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; + + if (bsearch(&key, filter->events, filter->nevents, + sizeof(__u64), cmp_u64)) + allow_event = filter->action == KVM_PMU_EVENT_ALLOW; + else + allow_event = filter->action == KVM_PMU_EVENT_DENY; } if (!allow_event) return; @@ -573,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) /* Ensure nevents can't be changed between the user copies. */ *filter = tmp; + /* + * Sort the in-kernel list so that we can search it with bsearch. + */ + sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); + mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0e5b49294086..90364d02f22a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -295,13 +295,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, struct kvm_vcpu *vcpu; unsigned long i; + /* + * Wake any target vCPUs that are blocking, i.e. waiting for a wake + * event. There's no need to signal doorbells, as hardware has handled + * vCPUs that were in guest at the time of the IPI, and vCPUs that have + * since entered the guest will have processed pending IRQs at VMRUN. + */ kvm_for_each_vcpu(i, vcpu, kvm) { - bool m = kvm_apic_match_dest(vcpu, source, - icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK); - - if (m && !avic_vcpu_is_running(vcpu)) + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & APIC_DEST_MASK)) kvm_vcpu_wake_up(vcpu); } } @@ -672,9 +675,22 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) return -1; kvm_lapic_set_irr(vec, vcpu->arch.apic); + + /* + * Pairs with the smp_mb_*() after setting vcpu->guest_mode in + * vcpu_enter_guest() to ensure the write to the vIRR is ordered before + * the read of guest_mode, which guarantees that either VMRUN will see + * and process the new vIRR entry, or that the below code will signal + * the doorbell if the vCPU is already running in the guest. + */ smp_mb__after_atomic(); - if (avic_vcpu_is_running(vcpu)) { + /* + * Signal the doorbell to tell hardware to inject the IRQ if the vCPU + * is in the guest. If the vCPU is not in the guest, hardware will + * automatically process AVIC interrupts at VMRUN. + */ + if (vcpu->mode == IN_GUEST_MODE) { int cpu = READ_ONCE(vcpu->cpu); /* @@ -688,8 +704,13 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) if (cpu != get_cpu()) wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); put_cpu(); - } else + } else { + /* + * Wake the vCPU if it was blocking. KVM will then detect the + * pending IRQ when checking if the vCPU has a wake event. + */ kvm_vcpu_wake_up(vcpu); + } return 0; } @@ -957,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + /* * Since the host physical APIC id is 8 bits, * we can support host APIC ID upto 255. @@ -964,19 +987,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; + /* + * No need to update anything if the vCPU is blocking, i.e. if the vCPU + * is being scheduled in after being preempted. The CPU entries in the + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. + * If the vCPU was migrated, its new CPU value will be stuffed when the + * vCPU unblocks. + */ + if (kvm_vcpu_is_blocking(vcpu)) + return; + entry = READ_ONCE(*(svm->avic_physical_id_cache)); WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); - - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (svm->avic_is_running) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, - svm->avic_is_running); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); } void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -984,42 +1013,56 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + + /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) + return; + + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -/* - * This function is called during VCPU halt/unhalt. - */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +void avic_vcpu_blocking(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - int cpu = get_cpu(); - - WARN_ON(cpu != vcpu->cpu); - svm->avic_is_running = is_run; + if (!kvm_vcpu_apicv_active(vcpu)) + return; - if (kvm_vcpu_apicv_active(vcpu)) { - if (is_run) - avic_vcpu_load(vcpu, cpu); - else - avic_vcpu_put(vcpu); - } - put_cpu(); + preempt_disable(); + + /* + * Unload the AVIC when the vCPU is about to block, _before_ + * the vCPU actually blocks. + * + * Any IRQs that arrive before IsRunning=0 will not cause an + * incomplete IPI vmexit on the source, therefore vIRR will also + * be checked by kvm_vcpu_check_block() before blocking. The + * memory barrier implicit in set_current_state orders writing + * IsRunning=0 before reading the vIRR. The processor needs a + * matching memory barrier on interrupt delivery between writing + * IRR and reading IsRunning; the lack of this barrier might be + * the cause of errata #1235). + */ + avic_vcpu_put(vcpu); + + preempt_enable(); } -void svm_vcpu_blocking(struct kvm_vcpu *vcpu) +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) { - avic_set_running(vcpu, false); -} + int cpu; -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) -{ - if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) - kvm_vcpu_update_apicv(vcpu); - avic_set_running(vcpu, true); + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + cpu = get_cpu(); + WARN_ON(cpu != vcpu->cpu); + + avic_vcpu_load(vcpu, cpu); + + put_cpu(); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 12d8b301065a..5aa45f13b16d 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - if (!pmu) + if (!enable_pmu) return NULL; switch (msr) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 46bcc706f257..2c99b18d76c0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -192,10 +192,6 @@ module_param(vgif, int, 0444); static int lbrv = true; module_param(lbrv, int, 0444); -/* enable/disable PMU virtualization */ -bool pmu = true; -module_param(pmu, bool, 0444); - static int tsc_scaling = true; module_param(tsc_scaling, int, 0444); @@ -873,47 +869,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -/* - * The default MMIO mask is a single bit (excluding the present bit), - * which could conflict with the memory encryption bit. Check for - * memory encryption support and override the default MMIO mask if - * memory encryption is enabled. - */ -static __init void svm_adjust_mmio_mask(void) -{ - unsigned int enc_bit, mask_bit; - u64 msr, mask; - - /* If there is no memory encryption support, use existing mask */ - if (cpuid_eax(0x80000000) < 0x8000001f) - return; - - /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); - if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) - return; - - enc_bit = cpuid_ebx(0x8000001f) & 0x3f; - mask_bit = boot_cpu_data.x86_phys_bits; - - /* Increment the mask bit if it is the same as the encryption bit */ - if (enc_bit == mask_bit) - mask_bit++; - - /* - * If the mask bit location is below 52, then some bits above the - * physical addressing limit will always be reserved, so use the - * rsvd_bits() function to generate the mask. This mask, along with - * the present bit, will be used to generate a page fault with - * PFER.RSV = 1. - * - * If the mask bit location is 52 (or above), then clear the mask. - */ - mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - - kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); -} - static void svm_hardware_teardown(void) { int cpu; @@ -928,198 +883,6 @@ static void svm_hardware_teardown(void) iopm_base = 0; } -static __init void svm_set_cpu_caps(void) -{ - kvm_set_cpu_caps(); - - supported_xss = 0; - - /* CPUID 0x80000001 and 0x8000000A (SVM features) */ - if (nested) { - kvm_cpu_cap_set(X86_FEATURE_SVM); - - if (nrips) - kvm_cpu_cap_set(X86_FEATURE_NRIPS); - - if (npt_enabled) - kvm_cpu_cap_set(X86_FEATURE_NPT); - - if (tsc_scaling) - kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); - - /* Nested VM can receive #VMEXIT instead of triggering #GP */ - kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); - } - - /* CPUID 0x80000008 */ - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - - /* AMD PMU PERFCTR_CORE CPUID */ - if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); - - /* CPUID 0x8000001F (SME/SEV features) */ - sev_set_cpu_caps(); -} - -static __init int svm_hardware_setup(void) -{ - int cpu; - struct page *iopm_pages; - void *iopm_va; - int r; - unsigned int order = get_order(IOPM_SIZE); - - /* - * NX is required for shadow paging and for NPT if the NX huge pages - * mitigation is enabled. - */ - if (!boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("NX (Execute Disable) not supported\n"); - return -EOPNOTSUPP; - } - kvm_enable_efer_bits(EFER_NX); - - iopm_pages = alloc_pages(GFP_KERNEL, order); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - init_msrpm_offsets(); - - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) - kvm_enable_efer_bits(EFER_FFXSR); - - if (tsc_scaling) { - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - tsc_scaling = false; - } else { - pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; - } - } - - tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); - - /* Check for pause filtering support */ - if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { - pause_filter_count = 0; - pause_filter_thresh = 0; - } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { - pause_filter_thresh = 0; - } - - if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); - } - - /* - * KVM's MMU doesn't support using 2-level paging for itself, and thus - * NPT isn't supported if the host is using 2-level paging since host - * CR4 is unchanged on VMRUN. - */ - if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) - npt_enabled = false; - - if (!boot_cpu_has(X86_FEATURE_NPT)) - npt_enabled = false; - - /* Force VM NPT level equal to the host's paging level */ - kvm_configure_mmu(npt_enabled, get_npt_level(), - get_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); - - /* Note, SEV setup consumes npt_enabled. */ - sev_hardware_setup(); - - svm_hv_hardware_setup(); - - svm_adjust_mmio_mask(); - - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - - if (nrips) { - if (!boot_cpu_has(X86_FEATURE_NRIPS)) - nrips = false; - } - - enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); - - if (enable_apicv) { - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } - - if (vls) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || - !IS_ENABLED(CONFIG_X86_64)) { - vls = false; - } else { - pr_info("Virtual VMLOAD VMSAVE supported\n"); - } - } - - if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) - svm_gp_erratum_intercept = false; - - if (vgif) { - if (!boot_cpu_has(X86_FEATURE_VGIF)) - vgif = false; - else - pr_info("Virtual GIF supported\n"); - } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - - if (!pmu) - pr_info("PMU virtualization is disabled\n"); - - svm_set_cpu_caps(); - - /* - * It seems that on AMD processors PTE's accessed bit is - * being set by the CPU hardware before the NPF vmexit. - * This is not expected behaviour and our tests fail because - * of it. - * A workaround here is to disable support for - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. - * In this case userspace can know if there is support using - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle - * it - * If future AMD CPU models change the behaviour described above, - * this variable can be changed accordingly - */ - allow_smaller_maxphyaddr = !npt_enabled; - - return 0; - -err: - svm_hardware_teardown(); - return r; -} - static void init_seg(struct vmcb_seg *seg) { seg->selector = 0; @@ -1444,12 +1207,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (err) goto error_free_vmsa_page; - /* We initialize this flag to true to make sure that the is_running - * bit would be set the first time the vcpu is loaded. - */ - if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) - svm->avic_is_running = true; - svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; @@ -3833,6 +3590,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(vcpu); } +static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + return 1; +} + static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && @@ -4629,8 +4391,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .vcpu_blocking = svm_vcpu_blocking, - .vcpu_unblocking = svm_vcpu_unblocking, + .vcpu_blocking = avic_vcpu_blocking, + .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, @@ -4662,6 +4424,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .tlb_flush_gva = svm_flush_tlb_gva, .tlb_flush_guest = svm_flush_tlb, + .vcpu_pre_run = svm_vcpu_pre_run, .run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, @@ -4742,6 +4505,243 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, }; +/* + * The default MMIO mask is a single bit (excluding the present bit), + * which could conflict with the memory encryption bit. Check for + * memory encryption support and override the default MMIO mask if + * memory encryption is enabled. + */ +static __init void svm_adjust_mmio_mask(void) +{ + unsigned int enc_bit, mask_bit; + u64 msr, mask; + + /* If there is no memory encryption support, use existing mask */ + if (cpuid_eax(0x80000000) < 0x8000001f) + return; + + /* If memory encryption is not enabled, use existing mask */ + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) + return; + + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; + mask_bit = boot_cpu_data.x86_phys_bits; + + /* Increment the mask bit if it is the same as the encryption bit */ + if (enc_bit == mask_bit) + mask_bit++; + + /* + * If the mask bit location is below 52, then some bits above the + * physical addressing limit will always be reserved, so use the + * rsvd_bits() function to generate the mask. This mask, along with + * the present bit, will be used to generate a page fault with + * PFER.RSV = 1. + * + * If the mask bit location is 52 (or above), then clear the mask. + */ + mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); +} + +static __init void svm_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + supported_xss = 0; + + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ + if (nested) { + kvm_cpu_cap_set(X86_FEATURE_SVM); + + if (nrips) + kvm_cpu_cap_set(X86_FEATURE_NRIPS); + + if (npt_enabled) + kvm_cpu_cap_set(X86_FEATURE_NPT); + + if (tsc_scaling) + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + + /* Nested VM can receive #VMEXIT instead of triggering #GP */ + kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); + } + + /* CPUID 0x80000008 */ + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* AMD PMU PERFCTR_CORE CPUID */ + if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + + /* CPUID 0x8000001F (SME/SEV features) */ + sev_set_cpu_caps(); +} + +static __init int svm_hardware_setup(void) +{ + int cpu; + struct page *iopm_pages; + void *iopm_va; + int r; + unsigned int order = get_order(IOPM_SIZE); + + /* + * NX is required for shadow paging and for NPT if the NX huge pages + * mitigation is enabled. + */ + if (!boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + kvm_enable_efer_bits(EFER_NX); + + iopm_pages = alloc_pages(GFP_KERNEL, order); + + if (!iopm_pages) + return -ENOMEM; + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + + init_msrpm_offsets(); + + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) + kvm_enable_efer_bits(EFER_FFXSR); + + if (tsc_scaling) { + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + tsc_scaling = false; + } else { + pr_info("TSC scaling supported\n"); + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; + } + } + + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); + + /* Check for pause filtering support */ + if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { + pause_filter_count = 0; + pause_filter_thresh = 0; + } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { + pause_filter_thresh = 0; + } + + if (nested) { + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + } + + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) + npt_enabled = false; + + if (!boot_cpu_has(X86_FEATURE_NPT)) + npt_enabled = false; + + /* Force VM NPT level equal to the host's paging level */ + kvm_configure_mmu(npt_enabled, get_npt_level(), + get_npt_level(), PG_LEVEL_1G); + pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + + /* Note, SEV setup consumes npt_enabled. */ + sev_hardware_setup(); + + svm_hv_hardware_setup(); + + svm_adjust_mmio_mask(); + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + + if (nrips) { + if (!boot_cpu_has(X86_FEATURE_NRIPS)) + nrips = false; + } + + enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); + + if (enable_apicv) { + pr_info("AVIC enabled\n"); + + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } else { + svm_x86_ops.vcpu_blocking = NULL; + svm_x86_ops.vcpu_unblocking = NULL; + } + + if (vls) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || + !IS_ENABLED(CONFIG_X86_64)) { + vls = false; + } else { + pr_info("Virtual VMLOAD VMSAVE supported\n"); + } + } + + if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) + svm_gp_erratum_intercept = false; + + if (vgif) { + if (!boot_cpu_has(X86_FEATURE_VGIF)) + vgif = false; + else + pr_info("Virtual GIF supported\n"); + } + + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + + if (!enable_pmu) + pr_info("PMU virtualization is disabled\n"); + + svm_set_cpu_caps(); + + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + + return 0; + +err: + svm_hardware_teardown(); + return r; +} + + static struct kvm_x86_init_ops svm_init_ops __initdata = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9f153c59f2c8..47ef8f4a9358 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -32,7 +32,6 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern bool intercept_smi; -extern bool pmu; /* * Clean bits in VMCB. @@ -226,7 +225,6 @@ struct vcpu_svm { u32 dfr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; - bool avic_is_running; /* * Per-vcpu list of struct amd_svm_iommu_ir: @@ -574,17 +572,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 *entry = svm->avic_physical_id_cache; - - if (!entry) - return false; - - return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); -} - int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -605,8 +592,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec); bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); -void svm_vcpu_blocking(struct kvm_vcpu *vcpu); -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_vcpu_blocking(struct kvm_vcpu *vcpu); +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); /* sev.c */ diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index c8029b7845b6..959b59d13b5a 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -5,6 +5,7 @@ #include <asm/vmx.h> #include "lapic.h" +#include "x86.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -389,6 +390,9 @@ static inline u64 vmx_get_perf_capabilities(void) { u64 perf_cap = 0; + if (!enable_pmu) + return perf_cap; + if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5e0ac57d6d1b..466d18fc0c5d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -21,7 +21,6 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) static struct kvm_event_hw_type_mapping intel_arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, @@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + /* The above index must match CPUID 0x0A.EBX bit vector */ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, }; @@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; - for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select && - intel_arch_events[i].unit_mask == unit_mask && - (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) - break; + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) { + if (intel_arch_events[i].eventsel != event_select || + intel_arch_events[i].unit_mask != unit_mask) + continue; + + /* disable event that reported as not present by cpuid */ + if ((i < 7) && !(pmu->available_event_types & (1 << i))) + return PERF_COUNT_HW_MAX + 1; + + break; + } if (i == ARRAY_SIZE(intel_arch_events)) return PERF_COUNT_HW_MAX; @@ -481,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) + if (!entry || !enable_pmu) return; eax.full = entry->eax; edx.full = entry->edx; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 88c53c521094..aa1fe9085d77 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -19,7 +19,7 @@ * wake the target vCPUs. vCPUs are removed from the list and the notification * vector is reset when the vCPU is scheduled in. */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); /* * Protect the per-CPU list with a per-CPU spinlock to handle task migration. * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the @@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will * occur if a wakeup IRQ arrives and attempts to acquire the lock. */ -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); +static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { @@ -51,7 +51,9 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; + unsigned long flags; unsigned int dest; /* @@ -62,23 +64,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!enable_apicv || !lapic_in_kernel(vcpu)) return; - /* Nothing to do if PI.SN and PI.NDST both have the desired value. */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + /* + * If the vCPU wasn't on the wakeup list and wasn't migrated, then the + * full update can be skipped as neither the vector nor the destination + * needs to be changed. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { + /* + * Clear SN if it was set due to being preempted. Again, do + * this even if there is no assigned device for simplicity. + */ + if (pi_test_and_clear_sn(pi_desc)) + goto after_clear_sn; return; + } + + local_irq_save(flags); /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. + * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup + * list of the _previous_ pCPU, which will not be the same as the + * current pCPU if the task was migrated. */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_del(&vmx->pi_wakeup_list); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); } - /* The full case. Set the new destination and clear SN. */ dest = cpu_physical_id(cpu); if (!x2apic_mode) dest = (dest << 8) & 0xFF00; @@ -86,10 +99,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) do { old.control = new.control = READ_ONCE(pi_desc->control); + /* + * Clear SN (as above) and refresh the destination APIC ID to + * handle task migration (@cpu != vcpu->cpu). + */ new.ndst = dest; new.sn = 0; + + /* + * Restore the notification vector; in the blocking case, the + * descriptor was modified on "put" to use the wakeup vector. + */ + new.nv = POSTED_INTR_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); + local_irq_restore(flags); + after_clear_sn: /* @@ -111,83 +136,25 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) irq_remapping_cap(IRQ_POSTING_CAP); } -void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!vmx_can_use_vtd_pi(vcpu->kvm)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * Remove the vCPU from the wakeup list of the _previous_ pCPU, which - * will not be the same as the current pCPU if the task was migrated. - */ - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - - dest = cpu_physical_id(vcpu->cpu); - if (!x2apic_mode) - dest = (dest << 8) & 0xFF00; - - WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the vCPU was blocking"); - - do { - old.control = new.control = READ_ONCE(pi_desc->control); - - new.ndst = dest; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); - - vcpu->pre_pcpu = -1; -} - /* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * + * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set + * WAKEUP as the notification vector in the PI descriptor. */ -int pi_pre_block(struct kvm_vcpu *vcpu) +static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { - struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct pi_desc old, new; unsigned long flags; - if (!vmx_can_use_vtd_pi(vcpu->kvm) || - vmx_interrupt_blocked(vcpu)) - return 0; - local_irq_save(flags); - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_add_tail(&vmx->pi_wakeup_list, + &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - WARN(pi_desc->sn == 1, - "Posted Interrupt Suppress Notification set before blocking"); + WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); do { old.control = new.control = READ_ONCE(pi_desc->control); @@ -196,24 +163,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu) new.nv = POSTED_INTR_WAKEUP_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc)) - __pi_post_block(vcpu); + /* + * Send a wakeup IPI to this CPU if an interrupt may have been posted + * before the notification vector was updated, in which case the IRQ + * will arrive on the non-wakeup vector. An IPI is needed as calling + * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not + * enabled until it is safe to call try_to_wake_up() on the task being + * scheduled out). + */ + if (pi_test_on(&new)) + apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); local_irq_restore(flags); - return (vcpu->pre_pcpu == -1); } -void pi_post_block(struct kvm_vcpu *vcpu) +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { - unsigned long flags; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (vcpu->pre_pcpu == -1) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; - local_irq_save(flags); - __pi_post_block(vcpu); - local_irq_restore(flags); + if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + pi_enable_wakeup_handler(vcpu); + + /* + * Set SN when the vCPU is preempted. Note, the vCPU can both be seen + * as blocking and preempted, e.g. if it's preempted between setting + * its wait state and manually scheduling out. + */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } /* @@ -221,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu) */ void pi_wakeup_handler(void) { - struct kvm_vcpu *vcpu; int cpu = smp_processor_id(); + struct vcpu_vmx *vmx; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); + list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu), + pi_wakeup_list) { - if (pi_test_on(pi_desc)) - kvm_vcpu_kick(vcpu); + if (pi_test_on(&vmx->pi_desc)) + kvm_vcpu_wake_up(&vmx->vcpu); } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } void __init pi_init_cpu(int cpu) { - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); + raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) @@ -254,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) * Bail out of the block loop if the VM has an assigned * device, but the blocking vCPU didn't reconfigure the * PI.NV to the wakeup vector, i.e. the assigned device - * came along after the initial check in pi_pre_block(). + * came along after the initial check in vmx_vcpu_pi_put(). */ void vmx_pi_start_assignment(struct kvm *kvm) { diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 36ae035f14aa..eb14e76b84ef 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); @@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); -int pi_pre_block(struct kvm_vcpu *vcpu); -void pi_post_block(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1b2e9d8c5cc9..4ac676066d60 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3931,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - bool nested) +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + int pi_vec) { #ifdef CONFIG_SMP - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; - if (vcpu->mode == IN_GUEST_MODE) { /* * The vector of interrupt to be delivered to vcpu had @@ -3964,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, */ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return true; + return; } #endif - return false; + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, @@ -3997,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, smp_mb__after_atomic(); /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); return 0; } return -1; @@ -4035,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) - kvm_vcpu_kick(vcpu); - + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); return 0; } @@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } +static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending; +} + static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) { + if (vmx_emulation_required_with_pending_exception(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } +static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (vmx_emulation_required_with_pending_exception(vcpu)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + return 1; +} + static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6928,6 +6945,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); + INIT_LIST_HEAD(&vmx->pi_wakeup_list); + err = -ENOMEM; vmx->vpid = allocate_vpid(); @@ -7549,25 +7568,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - if (pi_pre_block(vcpu)) - return 1; - - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - - return 0; -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - if (kvm_x86_ops.set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7710,6 +7710,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_guest = vmx_flush_tlb_guest, + .vcpu_pre_run = vmx_vcpu_pre_run, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, @@ -7768,9 +7769,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f8fc7441baea..7f2c82e7f38f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -317,6 +317,9 @@ struct vcpu_vmx { /* Posted interrupt descriptor */ struct pi_desc pi_desc; + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76b4803dd3bd..9e43d756312f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -187,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); +/* Enable/disable PMU virtualization */ +bool __read_mostly enable_pmu = true; +EXPORT_SYMBOL_GPL(enable_pmu); +module_param(enable_pmu, bool, 0444); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU @@ -5230,17 +5235,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; - /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with - * the core vCPU model on the fly, so fail. - */ - r = -EINVAL; - if (vcpu->arch.last_vmentry_cpu != -1) - goto out; - r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; @@ -5251,14 +5245,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; - /* - * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in - * KVM_SET_CPUID case above. - */ - r = -EINVAL; - if (vcpu->arch.last_vmentry_cpu != -1) - goto out; - r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; @@ -9945,10 +9931,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_mb__after_srcu_read_unlock(); /* - * This handles the case where a posted interrupt was - * notified with kvm_vcpu_kick. Assigned devices can - * use the POSTED_INTR_VECTOR even if APICv is disabled, - * so do it even if APICv is disabled on this vCPU. + * Process pending posted interrupts to handle the case where the + * notification IRQ arrived in the host, or was never sent (because the + * target vCPU wasn't running). Do this regardless of the vCPU's APICv + * status, KVM doesn't update assigned devices when APICv is inhibited, + * i.e. they can post interrupts even if APICv is temporarily disabled. */ if (kvm_lapic_enabled(vcpu)) static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); @@ -10113,8 +10100,20 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { + bool hv_timer; + + if (!kvm_arch_vcpu_runnable(vcpu)) { + /* + * Switch to the software timer before halt-polling/blocking as + * the guest's timer may be a break event for the vCPU, and the + * hypervisor timer runs only when the CPU is in guest mode. + * Switch before halt-polling so that KVM recognizes an expired + * timer before blocking. + */ + hv_timer = kvm_lapic_hv_timer_in_use(vcpu); + if (hv_timer) + kvm_lapic_switch_to_sw_timer(vcpu); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) kvm_vcpu_halt(vcpu); @@ -10122,8 +10121,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_x86_ops.post_block) - static_call(kvm_x86_post_block)(vcpu); + if (hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; @@ -10316,6 +10315,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = -EINTR; goto out; } + /* + * It should be impossible for the hypervisor timer to be in + * use before KVM has ever run the vCPU. + */ + WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); kvm_vcpu_block(vcpu); if (kvm_apic_accept_events(vcpu) < 0) { r = 0; @@ -10360,10 +10364,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - if (kvm_run->immediate_exit) + if (kvm_run->immediate_exit) { r = -EINTR; - else - r = vcpu_run(vcpu); + goto out; + } + + r = static_call(kvm_x86_vcpu_pre_run)(vcpu); + if (r <= 0) + goto out; + + r = vcpu_run(vcpu); out: kvm_put_guest_fpu(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index bec8ed090abc..635b75f9e145 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -336,6 +336,7 @@ extern u64 host_xcr0; extern u64 supported_xcr0; extern u64 host_xss; extern u64 supported_xss; +extern bool enable_pmu; static inline bool kvm_mpx_supported(void) { diff --git a/arch/xtensa/Makefile b/arch/xtensa/Makefile index 9778216d6e09..ee2769519eaf 100644 --- a/arch/xtensa/Makefile +++ b/arch/xtensa/Makefile @@ -12,7 +12,7 @@ # Core configuration. # (Use VAR=<xtensa_config> to use another default compiler.) -variant-y := $(patsubst "%",%,$(CONFIG_XTENSA_VARIANT_NAME)) +variant-y := $(CONFIG_XTENSA_VARIANT_NAME) VARIANT = $(variant-y) diff --git a/arch/xtensa/boot/dts/Makefile b/arch/xtensa/boot/dts/Makefile index 0b8d00cdae7c..720628c0d8b9 100644 --- a/arch/xtensa/boot/dts/Makefile +++ b/arch/xtensa/boot/dts/Makefile @@ -7,10 +7,7 @@ # # -BUILTIN_DTB_SOURCE := $(patsubst "%",%,$(CONFIG_BUILTIN_DTB_SOURCE)).dtb.o -ifneq ($(CONFIG_BUILTIN_DTB_SOURCE),"") -obj-$(CONFIG_OF) += $(BUILTIN_DTB_SOURCE) -endif +obj-$(CONFIG_OF) += $(addsuffix .dtb.o, $(CONFIG_BUILTIN_DTB_SOURCE)) # for CONFIG_OF_ALL_DTBS test dtstree := $(srctree)/$(src) |