Diffstat (limited to 'arch/arm64/kvm/at.c')
-rw-r--r-- | arch/arm64/kvm/at.c | 1101
1 file changed, 1101 insertions, 0 deletions
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c new file mode 100644 index 000000000000..39f0e87a340e --- /dev/null +++ b/arch/arm64/kvm/at.c @@ -0,0 +1,1101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Linaro Ltd + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm_host.h> + +#include <asm/esr.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +enum trans_regime { + TR_EL10, + TR_EL20, + TR_EL2, +}; + +struct s1_walk_info { + u64 baddr; + enum trans_regime regime; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int txsz; + int sl; + bool hpd; + bool be; + bool s2; +}; + +struct s1_walk_result { + union { + struct { + u64 desc; + u64 pa; + s8 level; + u8 APTable; + bool UXNTable; + bool PXNTable; + }; + struct { + u8 fst; + bool ptw; + bool s2; + }; + }; + bool failed; +}; + +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) +{ + wr->fst = fst; + wr->ptw = ptw; + wr->s2 = s2; + wr->failed = true; +} + +#define S1_MMU_DISABLED (-127) + +static int get_ia_size(struct s1_walk_info *wi) +{ + return 64 - wi->txsz; +} + +/* Return true if the IPA is out of the OA range */ +static bool check_output_size(u64 ipa, struct s1_walk_info *wi) +{ + return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); +} + +/* Return the translation regime that applies to an AT instruction */ +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) +{ + /* + * We only get here from guest EL2, so the translation + * regime AT applies to is solely defined by {E2H,TGE}. + */ + switch (op) { + case OP_AT_S1E2R: + case OP_AT_S1E2W: + case OP_AT_S1E2A: + return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + break; + default: + return (vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; + } +} + +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; + unsigned int stride, x; + bool va55, tbi, lva, as_el0; + + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + + wi->regime = compute_translation_regime(vcpu, op); + as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + + va55 = va & BIT(55); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + + switch (wi->regime) { + case TR_EL10: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL2: + case TR_EL20: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + tbi = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_TBI, tcr) : + (va55 ? + FIELD_GET(TCR_TBI1, tcr) : + FIELD_GET(TCR_TBI0, tcr))); + + if (!tbi && (u64)sign_extend64(va, 55) != va) + goto addrsz; + + va = (u64)sign_extend64(va, 55); + + /* Let's put the MMU disabled case aside immediately */ + switch (wi->regime) { + case TR_EL10: + /* + * If dealing with the EL1&0 translation regime, 3 things + * can disable the S1 translation: + * + * - HCR_EL2.DC = 1 + * - HCR_EL2.{E2H,TGE} = {0,1} + * - SCTLR_EL1.M = 0 + * + * The TGE part is interesting. 
If we have decided that this + * is EL1&0, then it means that either {E2H,TGE} == {1,0} or + * {0,x}, and we only need to test for TGE == 1. + */ + if (hcr & (HCR_DC | HCR_TGE)) { + wr->level = S1_MMU_DISABLED; + break; + } + fallthrough; + case TR_EL2: + case TR_EL20: + if (!(sctlr & SCTLR_ELx_M)) + wr->level = S1_MMU_DISABLED; + break; + } + + if (wr->level == S1_MMU_DISABLED) { + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) + goto addrsz; + + wr->pa = va; + return 0; + } + + wi->be = sctlr & SCTLR_ELx_EE; + + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); + wi->hpd &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HPD, tcr) : + (va55 ? + FIELD_GET(TCR_HPD1, tcr) : + FIELD_GET(TCR_HPD0, tcr))); + + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + /* R_PLCGL, R_YXNYW */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { + if (wi->txsz > 39) + goto transfault_l0; + } else { + if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) + goto transfault_l0; + } + + /* R_GTJBY, R_SXWGM */ + switch (BIT(wi->pgshift)) { + case SZ_4K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_16K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_64K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); + break; + } + + if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) + goto transfault_l0; + + ia_bits = get_ia_size(wi); + + /* R_YYVYV, I_THCZK */ + if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || + (va55 && va < GENMASK(63, ia_bits))) + goto transfault_l0; + + /* I_ZFSYQ */ + if (wi->regime != TR_EL2 && + (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) + goto transfault_l0; + + /* R_BNDVG and following statements */ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && + as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + goto transfault_l0; + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + ps = (wi->regime == TR_EL2 ? 
+ FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); + + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); + + /* Compute minimal alignment */ + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); + + wi->baddr = ttbr & TTBRx_EL1_BADDR; + + /* R_VPBBF */ + if (check_output_size(wi->baddr, wi)) + goto addrsz; + + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + + return 0; + +addrsz: /* Address Size Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); + return -EFAULT; + +transfault_l0: /* Translation Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); + return -EFAULT; +} + +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 va_top, va_bottom, baddr, desc; + int level, stride, ret; + + level = wi->sl; + stride = wi->pgshift - 3; + baddr = wi->baddr; + + va_top = get_ia_size(wi) - 1; + + while (1) { + u64 index, ipa; + + va_bottom = (3 - level) * stride + wi->pgshift; + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); + + ipa = baddr | index; + + if (wi->s2) { + struct kvm_s2_trans s2_trans = {}; + + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); + if (ret) { + fail_s1_walk(wr, + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, + true, true); + return ret; + } + + if (!kvm_s2_trans_readable(&s2_trans)) { + fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), + true, true); + + return -EPERM; + } + + ipa = kvm_s2_trans_output(&s2_trans); + } + + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + if (ret) { + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), + true, false); + return ret; + } + + if (wi->be) + desc = be64_to_cpu((__force __be64)desc); + else + desc = le64_to_cpu((__force __le64)desc); + + /* Invalid descriptor */ + if (!(desc & BIT(0))) + goto transfault; + + /* Block mapping, check validity down the line */ + if (!(desc & BIT(1))) + break; + + /* Page mapping */ + if (level == 3) + break; + + /* Table handling */ + if (!wi->hpd) { + wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); + } + + baddr = desc & GENMASK_ULL(47, wi->pgshift); + + /* Check for out-of-range OA */ + if (check_output_size(baddr, wi)) + goto addrsz; + + /* Prepare for next round */ + va_top = va_bottom - 1; + level++; + } + + /* Block mapping, check the validity of the level */ + if (!(desc & BIT(1))) { + bool valid_block = false; + + switch (BIT(wi->pgshift)) { + case SZ_4K: + valid_block = level == 1 || level == 2; + break; + case SZ_16K: + case SZ_64K: + valid_block = level == 2; + break; + } + + if (!valid_block) + goto transfault; + } + + if (check_output_size(desc & GENMASK(47, va_bottom), wi)) + goto addrsz; + + va_bottom += contiguous_bit_shift(desc, wi, level); + + wr->failed = false; + wr->level = level; + wr->desc = desc; + wr->pa = desc & GENMASK(47, va_bottom); + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + + return 0; + +addrsz: + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); + return -EINVAL; +transfault: + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); + return -ENOENT; +} + +struct mmu_config { + u64 ttbr0; + u64 ttbr1; + u64 tcr; + u64 mair; + u64 sctlr; + u64 vttbr; + u64 vtcr; + u64 hcr; +}; + +static void __mmu_config_save(struct mmu_config *config) +{ + config->ttbr0 = read_sysreg_el1(SYS_TTBR0); + config->ttbr1 = read_sysreg_el1(SYS_TTBR1); + config->tcr = read_sysreg_el1(SYS_TCR); + config->mair = 
read_sysreg_el1(SYS_MAIR); + config->sctlr = read_sysreg_el1(SYS_SCTLR); + config->vttbr = read_sysreg(vttbr_el2); + config->vtcr = read_sysreg(vtcr_el2); + config->hcr = read_sysreg(hcr_el2); +} + +static void __mmu_config_restore(struct mmu_config *config) +{ + write_sysreg(config->hcr, hcr_el2); + + /* + * ARM errata 1165522 and 1530923 require TGE to be 1 before + * we update the guest state. + */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + + write_sysreg_el1(config->ttbr0, SYS_TTBR0); + write_sysreg_el1(config->ttbr1, SYS_TTBR1); + write_sysreg_el1(config->tcr, SYS_TCR); + write_sysreg_el1(config->mair, SYS_MAIR); + write_sysreg_el1(config->sctlr, SYS_SCTLR); + write_sysreg(config->vttbr, vttbr_el2); + write_sysreg(config->vtcr, vtcr_el2); +} + +static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 host_pan; + bool fail; + + host_pan = read_sysreg_s(SYS_PSTATE_PAN); + write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); + + switch (op) { + case OP_AT_S1E1RP: + fail = __kvm_at(OP_AT_S1E1RP, vaddr); + break; + case OP_AT_S1E1WP: + fail = __kvm_at(OP_AT_S1E1WP, vaddr); + break; + } + + write_sysreg_s(host_pan, SYS_PSTATE_PAN); + + return fail; +} + +#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) +#define MEMATTR_NC 0b0100 +#define MEMATTR_Wt 0b1000 +#define MEMATTR_Wb 0b1100 +#define MEMATTR_WbRaWa 0b1111 + +#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) + +static u8 s2_memattr_to_attr(u8 memattr) +{ + memattr &= 0b1111; + + switch (memattr) { + case 0b0000: + case 0b0001: + case 0b0010: + case 0b0011: + return memattr << 2; + case 0b0100: + return MEMATTR(Wb, Wb); + case 0b0101: + return MEMATTR(NC, NC); + case 0b0110: + return MEMATTR(Wt, NC); + case 0b0111: + return MEMATTR(Wb, NC); + case 0b1000: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1001: + return MEMATTR(NC, Wt); + case 0b1010: + return MEMATTR(Wt, Wt); + case 0b1011: + return MEMATTR(Wb, Wt); + case 0b1100: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1101: + return MEMATTR(NC, Wb); + case 0b1110: + return MEMATTR(Wt, Wb); + case 0b1111: + return MEMATTR(Wb, Wb); + default: + unreachable(); + } +} + +static u8 combine_s1_s2_attr(u8 s1, u8 s2) +{ + bool transient; + u8 final = 0; + + /* Upgrade transient s1 to non-transient to simplify things */ + switch (s1) { + case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ + transient = true; + s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); + break; + case 0b0101 ... 
0b0111: /* Normal, Write-Back Transient */ + transient = true; + s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); + break; + default: + transient = false; + } + + /* S2CombineS1AttrHints() */ + if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || + (s2 & GENMASK(3, 2)) == MEMATTR_NC) + final = MEMATTR_NC; + else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || + (s2 & GENMASK(3, 2)) == MEMATTR_Wt) + final = MEMATTR_Wt; + else + final = MEMATTR_Wb; + + if (final != MEMATTR_NC) { + /* Inherit RaWa hints form S1 */ + if (transient) { + switch (s1 & GENMASK(3, 2)) { + case MEMATTR_Wt: + final = 0; + break; + case MEMATTR_Wb: + final = MEMATTR_NC; + break; + } + } + + final |= s1 & GENMASK(1, 0); + } + + return final; +} + +#define ATTR_NSH 0b00 +#define ATTR_RSV 0b01 +#define ATTR_OSH 0b10 +#define ATTR_ISH 0b11 + +static u8 compute_sh(u8 attr, u64 desc) +{ + u8 sh; + + /* Any form of device, as well as NC has SH[1:0]=0b10 */ + if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) + return ATTR_OSH; + + sh = FIELD_GET(PTE_SHARED, desc); + if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ + sh = ATTR_NSH; + + return sh; +} + +static u8 combine_sh(u8 s1_sh, u8 s2_sh) +{ + if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) + return ATTR_OSH; + if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) + return ATTR_ISH; + + return ATTR_NSH; +} + +static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, + struct kvm_s2_trans *tr) +{ + u8 s1_parattr, s2_memattr, final_attr; + u64 par; + + /* If S2 has failed to translate, report the damage */ + if (tr->esr) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= SYS_PAR_EL1_S; + par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); + return par; + } + + s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); + s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { + if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) + s2_memattr &= ~BIT(3); + + /* Combination of R_VRJSW and R_RHWZM */ + switch (s2_memattr) { + case 0b0101: + if (MEMATTR_IS_DEVICE(s1_parattr)) + final_attr = s1_parattr; + else + final_attr = MEMATTR(NC, NC); + break; + case 0b0110: + case 0b1110: + final_attr = MEMATTR(WbRaWa, WbRaWa); + break; + case 0b0111: + case 0b1111: + /* Preserve S1 attribute */ + final_attr = s1_parattr; + break; + case 0b0100: + case 0b1100: + case 0b1101: + /* Reserved, do something non-silly */ + final_attr = s1_parattr; + break; + default: + /* MemAttr[2]=0, Device from S2 */ + final_attr = s2_memattr & GENMASK(1,0) << 2; + } + } else { + /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ + u8 s2_parattr = s2_memattr_to_attr(s2_memattr); + + if (MEMATTR_IS_DEVICE(s1_parattr) || + MEMATTR_IS_DEVICE(s2_parattr)) { + final_attr = min(s1_parattr, s2_parattr); + } else { + /* At this stage, this is memory vs memory */ + final_attr = combine_s1_s2_attr(s1_parattr & 0xf, + s2_parattr & 0xf); + final_attr |= combine_s1_s2_attr(s1_parattr >> 4, + s2_parattr >> 4) << 4; + } + } + + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && + !MEMATTR_IS_DEVICE(final_attr)) + final_attr = MEMATTR(NC, NC); + + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); + par |= tr->output & GENMASK(47, 12); + par |= FIELD_PREP(SYS_PAR_EL1_SH, + combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), + compute_sh(final_attr, tr->desc))); + + return par; +} + +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, + enum trans_regime regime) +{ + u64 par; + + if (wr->failed) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); + par 
|= wr->ptw ? SYS_PAR_EL1_PTW : 0; + par |= wr->s2 ? SYS_PAR_EL1_S : 0; + } else if (wr->level == S1_MMU_DISABLED) { + /* MMU off or HCR_EL2.DC == 1 */ + par = SYS_PAR_EL1_NSE; + par |= wr->pa & GENMASK_ULL(47, 12); + + if (regime == TR_EL10 && + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, + MEMATTR(WbRaWa, WbRaWa)); + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); + } else { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); + } + } else { + u64 mair, sctlr; + u8 sh; + + par = SYS_PAR_EL1_NSE; + + mair = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, MAIR_EL1) : + vcpu_read_sys_reg(vcpu, MAIR_EL2)); + + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; + mair &= 0xff; + + sctlr = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, SCTLR_EL1) : + vcpu_read_sys_reg(vcpu, SCTLR_EL2)); + + /* Force NC for memory if SCTLR_ELx.C is clear */ + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) + mair = MEMATTR(NC, NC); + + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); + par |= wr->pa & GENMASK_ULL(47, 12); + + sh = compute_sh(mair, wr->desc); + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); + } + + return par; +} + +static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + u64 sctlr; + + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + return false; + + if (regime == TR_EL10) + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + else + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + + return sctlr & SCTLR_EL1_EPAN; +} + +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + bool perm_fail, ur, uw, ux, pr, pw, px; + struct s1_walk_result wr = {}; + struct s1_walk_info wi = {}; + int ret, idx; + + ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + if (ret) + goto compute_par; + + if (wr.level == S1_MMU_DISABLED) + goto compute_par; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = walk_s1(vcpu, &wi, &wr, vaddr); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (ret) + goto compute_par; + + /* FIXME: revisit when adding indirect permission support */ + /* AArch64.S1DirectBasePermissions() */ + if (wi.regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) { + case 0b00: + pr = pw = true; + ur = uw = false; + break; + case 0b01: + pr = pw = ur = uw = true; + break; + case 0b10: + pr = true; + pw = ur = uw = false; + break; + case 0b11: + pr = ur = true; + pw = uw = false; + break; + } + + switch (wr.APTable) { + case 0b00: + break; + case 0b01: + ur = uw = false; + break; + case 0b10: + pw = uw = false; + break; + case 0b11: + pw = ur = uw = false; + break; + } + + /* We don't use px for anything yet, but hey... 
*/ + px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw); + ux = !((wr.desc & PTE_UXN) || wr.UXNTable); + + if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) { + bool pan; + + pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; + pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux); + pw &= !pan; + pr &= !pan; + } + } else { + ur = uw = ux = false; + + if (!(wr.desc & PTE_RDONLY)) { + pr = pw = true; + } else { + pr = true; + pw = false; + } + + if (wr.APTable & BIT(1)) + pw = false; + + /* XN maps to UXN */ + px = !((wr.desc & PTE_UXN) || wr.UXNTable); + } + + perm_fail = false; + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1R: + case OP_AT_S1E2R: + perm_fail = !pr; + break; + case OP_AT_S1E1WP: + case OP_AT_S1E1W: + case OP_AT_S1E2W: + perm_fail = !pw; + break; + case OP_AT_S1E0R: + perm_fail = !ur; + break; + case OP_AT_S1E0W: + perm_fail = !uw; + break; + case OP_AT_S1E1A: + case OP_AT_S1E2A: + break; + default: + BUG(); + } + + if (perm_fail) + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); + +compute_par: + return compute_par_s1(vcpu, &wr, wi.regime); +} + +/* + * Return the PAR_EL1 value as the result of a valid translation. + * + * If the translation is unsuccessful, the value may only contain + * PAR_EL1.F, and cannot be taken at face value. It isn't an + * indication of the translation having failed, only that the fast + * path did not succeed, *unless* it indicates a S1 permission fault. + */ +static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct mmu_config config; + struct kvm_s2_mmu *mmu; + bool fail; + u64 par; + + par = SYS_PAR_EL1_F; + + /* + * We've trapped, so everything is live on the CPU. As we will + * be switching contexts behind everybody's back, disable + * interrupts while holding the mmu lock. + */ + guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); + + /* + * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already + * the right one (as we trapped from vEL2). If not, save the + * full MMU context. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) + goto skip_mmu_switch; + + /* + * Obtaining the S2 MMU for a L2 is horribly racy, and we may not + * find it (recycled by another vcpu, for example). When this + * happens, admit defeat immediately and use the SW (slow) path. 
+ */ + mmu = lookup_s2_mmu(vcpu); + if (!mmu) + return par; + + __mmu_config_save(&config); + + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); + __load_stage2(mmu, mmu->arch); + +skip_mmu_switch: + /* Clear TGE, enable S2 translation, we're rolling */ + write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); + isb(); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1WP: + fail = at_s1e1p_fast(vcpu, op, vaddr); + break; + case OP_AT_S1E1R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E1W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E0R: + fail = __kvm_at(OP_AT_S1E0R, vaddr); + break; + case OP_AT_S1E0W: + fail = __kvm_at(OP_AT_S1E0W, vaddr); + break; + case OP_AT_S1E1A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + break; + } + + if (!fail) + par = read_sysreg_par(); + + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + __mmu_config_restore(&config); + + return par; +} + +static bool par_check_s1_perm_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && + !(par & SYS_PAR_EL1_S)); +} + +void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + + /* + * If PAR_EL1 reports that AT failed on a S1 permission fault, we + * know for sure that the PTW was able to walk the S1 tables and + * there's nothing else to do. + * + * If AT failed for any other reason, then we must walk the guest S1 + * to emulate the instruction. + */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par; + + /* + * We've trapped, so everything is live on the CPU. As we will be + * switching context behind everybody's back, disable interrupts... 
+ */ + scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { + struct kvm_s2_mmu *mmu; + u64 val, hcr; + bool fail; + + mmu = &vcpu->kvm->arch.mmu; + + val = hcr = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + val |= HCR_VM; + + if (!vcpu_el2_e2h_is_set(vcpu)) + val |= HCR_NV | HCR_NV1; + + write_sysreg(val, hcr_el2); + isb(); + + par = SYS_PAR_EL1_F; + + switch (op) { + case OP_AT_S1E2R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E2W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E2A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + } + + isb(); + + if (!fail) + par = read_sysreg_par(); + + write_sysreg(hcr, hcr_el2); + isb(); + } + + /* We failed the translation, let's replay it in slow motion */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct kvm_s2_trans out = {}; + u64 ipa, par; + bool write; + int ret; + + /* Do the stage-1 translation */ + switch (op) { + case OP_AT_S12E1R: + op = OP_AT_S1E1R; + write = false; + break; + case OP_AT_S12E1W: + op = OP_AT_S1E1W; + write = true; + break; + case OP_AT_S12E0R: + op = OP_AT_S1E0R; + write = false; + break; + case OP_AT_S12E0W: + op = OP_AT_S1E0W; + write = true; + break; + default: + WARN_ON_ONCE(1); + return; + } + + __kvm_at_s1e01(vcpu, op, vaddr); + par = vcpu_read_sys_reg(vcpu, PAR_EL1); + if (par & SYS_PAR_EL1_F) + return; + + /* + * If we only have a single stage of translation (E2H=0 or + * TGE=1), exit early. Same thing if {VM,DC}=={0,0}. + */ + if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) || + !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) + return; + + /* Do the stage-2 translation */ + ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); + out.esr = 0; + ret = kvm_walk_nested_s2(vcpu, ipa, &out); + if (ret < 0) + return; + + /* Check the access permission */ + if (!out.esr && + ((!write && !out.readable) || (write && !out.writable))) + out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); + + par = compute_par_s12(vcpu, par, &out); + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} |
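
setup_s1_walk() derives the starting level of the stage-1 walk from the granule size and TxSZ, mirroring the architectural AArch64.S1StartLevel() pseudocode: each level resolves (pgshift - 3) bits of input address, and the walk starts at the highest level needed to cover the 64 - TxSZ input bits. A minimal user-space sketch of that arithmetic follows (names such as granule_shift are illustrative, not taken from the patch):

#include <assert.h>
#include <stdio.h>

/* Start level of a stage-1 walk, as computed in setup_s1_walk(). */
static int s1_start_level(unsigned int granule_shift, unsigned int txsz)
{
	unsigned int ia_bits = 64 - txsz;         /* input address size, cf. get_ia_size() */
	unsigned int stride  = granule_shift - 3; /* VA bits resolved per level */

	return 3 - (int)(((ia_bits - 1) - granule_shift) / stride);
}

int main(void)
{
	assert(s1_start_level(12, 16) == 0); /* 4K granule, 48-bit IA: start at level 0 */
	assert(s1_start_level(16, 25) == 2); /* 64K granule, 39-bit IA: start at level 2 */
	printf("ok\n");
	return 0;
}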
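
Inside walk_s1(), the descriptor fetched at each level is located by taking the VA bits between va_top and va_bottom and scaling them into an 8-byte index within the current table. A hedged, stand-alone illustration of that indexing, assuming a 4K granule with the 48-bit input range used in the example above:

#include <assert.h>
#include <stdint.h>

/* Byte offset of the level-'level' descriptor for 'va', 4K granule (stride 9). */
static uint64_t desc_offset(uint64_t va, int level)
{
	unsigned int stride    = 9;                         /* pgshift(12) - 3 */
	unsigned int va_bottom = (3 - level) * stride + 12; /* lowest VA bit used at this level */
	unsigned int va_top    = va_bottom + stride - 1;    /* highest VA bit used at this level */
	uint64_t     mask      = ((1ULL << (va_top + 1)) - 1) & ~((1ULL << va_bottom) - 1);

	/* ">> (va_bottom - 3)" turns the table index into a byte offset (index * 8). */
	return (va & mask) >> (va_bottom - 3);
}

int main(void)
{
	/* VA bit 39 set selects entry 1 of the level-0 table: offset 8 bytes. */
	assert(desc_offset(1ULL << 39, 0) == 8);
	/* VA bit 30 set selects entry 1 of the level-1 table: offset 8 bytes. */
	assert(desc_offset(1ULL << 30, 1) == 8);
	return 0;
}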
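
compute_par_s1()/compute_par_s12() hand the result back through PAR_EL1, so the caller only ever sees either a fault syndrome or a physical address plus memory attributes. The following decoding sketch is not kernel code; it assumes the architectural PAR_EL1 layout relied on here (F at bit 0; FST at [6:1], PTW at bit 8 and S at bit 9 on failure; SH at [8:7], PA at [47:12] and the MAIR-format ATTR at [63:56] on success):

#include <stdint.h>
#include <stdio.h>

static void decode_par(uint64_t par)
{
	if (par & 1) {                                /* PAR_EL1.F: translation failed */
		unsigned int fst = (par >> 1) & 0x3f; /* fault status code (FSC encoding) */
		unsigned int ptw = (par >> 8) & 1;    /* S2 fault while walking S1 tables */
		unsigned int s   = (par >> 9) & 1;    /* fault reported by stage 2 */
		printf("fault: fst=0x%02x ptw=%u s2=%u\n", fst, ptw, s);
	} else {
		uint64_t pa       = par & (((1ULL << 36) - 1) << 12); /* PA[47:12] */
		unsigned int sh   = (par >> 7) & 0x3;                 /* shareability */
		unsigned int attr = par >> 56;                        /* MAIR-format attributes */
		printf("ok: pa=0x%llx sh=%u attr=0x%02x\n",
		       (unsigned long long)pa, sh, attr);
	}
}

int main(void)
{
	decode_par(0x09);                                     /* F=1, FST=0x04: S1 translation fault, level 0 */
	decode_par((0xffULL << 56) | 0x80000000 | (3U << 7)); /* success: WB RaWa memory, inner shareable */
	return 0;
}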