1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
|
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <asm/asm-prototypes.h>
#include <asm/dbell.h>
#include <asm/kvm_ppc.h>
#include <asm/ppc-opcode.h>
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
static void __start_timing(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
u64 tb = mftb() - vc->tb_offset_applied;
vcpu->arch.cur_activity = next;
vcpu->arch.cur_tb_start = tb;
}
static void __accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
struct kvmhv_tb_accumulator *curr;
u64 tb = mftb() - vc->tb_offset_applied;
u64 prev_tb;
u64 delta;
u64 seq;
curr = vcpu->arch.cur_activity;
vcpu->arch.cur_activity = next;
prev_tb = vcpu->arch.cur_tb_start;
vcpu->arch.cur_tb_start = tb;
if (!curr)
return;
delta = tb - prev_tb;
seq = curr->seqcount;
curr->seqcount = seq + 1;
smp_wmb();
curr->tb_total += delta;
if (seq == 0 || delta < curr->tb_min)
curr->tb_min = delta;
if (delta > curr->tb_max)
curr->tb_max = delta;
smp_wmb();
curr->seqcount = seq + 2;
}
#define start_timing(vcpu, next) __start_timing(vcpu, next)
#define end_timing(vcpu) __start_timing(vcpu, NULL)
#define accumulate_time(vcpu, next) __accumulate_time(vcpu, next)
#else
#define start_timing(vcpu, next) do {} while (0)
#define end_timing(vcpu) do {} while (0)
#define accumulate_time(vcpu, next) do {} while (0)
#endif
static inline void mfslb(unsigned int idx, u64 *slbee, u64 *slbev)
{
asm volatile("slbmfev %0,%1" : "=r" (*slbev) : "r" (idx));
asm volatile("slbmfee %0,%1" : "=r" (*slbee) : "r" (idx));
}
static inline void mtslb(u64 slbee, u64 slbev)
{
asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
}
static inline void clear_slb_entry(unsigned int idx)
{
mtslb(idx, 0);
}
static inline void slb_clear_invalidate_partition(void)
{
clear_slb_entry(0);
asm volatile(PPC_SLBIA(6));
}
/*
* Malicious or buggy radix guests may have inserted SLB entries
* (only 0..3 because radix always runs with UPRT=1), so these must
* be cleared here to avoid side-channels. slbmte is used rather
* than slbia, as it won't clear cached translations.
*/
static void radix_clear_slb(void)
{
int i;
for (i = 0; i < 4; i++)
clear_slb_entry(i);
}
static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr)
{
struct kvm_nested_guest *nested = vcpu->arch.nested;
u32 lpid;
lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
/*
* All the isync()s are overkill but trivially follow the ISA
* requirements. Some can likely be replaced with justification
* comment for why they are not needed.
*/
isync();
mtspr(SPRN_LPID, lpid);
isync();
mtspr(SPRN_LPCR, lpcr);
isync();
mtspr(SPRN_PID, vcpu->arch.pid);
isync();
}
static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr)
{
u32 lpid;
int i;
lpid = kvm->arch.lpid;
mtspr(SPRN_LPID, lpid);
mtspr(SPRN_LPCR, lpcr);
mtspr(SPRN_PID, vcpu->arch.pid);
for (i = 0; i < vcpu->arch.slb_max; i++)
mtslb(vcpu->arch.slb[i].orige, vcpu->arch.slb[i].origv);
isync();
}
static void switch_mmu_to_host_radix(struct kvm *kvm, u32 pid)
{
isync();
mtspr(SPRN_PID, pid);
isync();
mtspr(SPRN_LPID, kvm->arch.host_lpid);
isync();
mtspr(SPRN_LPCR, kvm->arch.host_lpcr);
isync();
}
static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (kvm_is_radix(kvm)) {
radix_clear_slb();
} else {
int i;
int nr = 0;
/*
* This must run before switching to host (radix host can't
* access all SLBs).
*/
for (i = 0; i < vcpu->arch.slb_nr; i++) {
u64 slbee, slbev;
mfslb(i, &slbee, &slbev);
if (slbee & SLB_ESID_V) {
vcpu->arch.slb[nr].orige = slbee | i;
vcpu->arch.slb[nr].origv = slbev;
nr++;
}
}
vcpu->arch.slb_max = nr;
slb_clear_invalidate_partition();
}
}
int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *nested = vcpu->arch.nested;
struct kvmppc_vcore *vc = vcpu->arch.vcore;
s64 hdec;
u64 tb, purr, spurr;
u64 *exsave;
bool ri_set;
int trap;
unsigned long msr;
unsigned long host_hfscr;
unsigned long host_ciabr;
unsigned long host_dawr0;
unsigned long host_dawrx0;
unsigned long host_psscr;
unsigned long host_pidr;
unsigned long host_dawr1;
unsigned long host_dawrx1;
hdec = time_limit - mftb();
if (hdec < 0)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV);
WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME));
start_timing(vcpu, &vcpu->arch.rm_entry);
vcpu->arch.ceded = 0;
if (vc->tb_offset) {
u64 new_tb = mftb() + vc->tb_offset;
mtspr(SPRN_TBU40, new_tb);
tb = mftb();
if ((tb & 0xffffff) < (new_tb & 0xffffff))
mtspr(SPRN_TBU40, new_tb + 0x1000000);
vc->tb_offset_applied = vc->tb_offset;
}
msr = mfmsr();
host_hfscr = mfspr(SPRN_HFSCR);
host_ciabr = mfspr(SPRN_CIABR);
host_dawr0 = mfspr(SPRN_DAWR0);
host_dawrx0 = mfspr(SPRN_DAWRX0);
host_psscr = mfspr(SPRN_PSSCR);
host_pidr = mfspr(SPRN_PID);
if (cpu_has_feature(CPU_FTR_DAWR1)) {
host_dawr1 = mfspr(SPRN_DAWR1);
host_dawrx1 = mfspr(SPRN_DAWRX1);
}
if (vc->pcr)
mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
mtspr(SPRN_DPDES, vc->dpdes);
mtspr(SPRN_VTB, vc->vtb);
local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
mtspr(SPRN_PURR, vcpu->arch.purr);
mtspr(SPRN_SPURR, vcpu->arch.spurr);
if (dawr_enabled()) {
mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
if (cpu_has_feature(CPU_FTR_DAWR1)) {
mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
}
}
mtspr(SPRN_CIABR, vcpu->arch.ciabr);
mtspr(SPRN_IC, vcpu->arch.ic);
mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
(local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
mtspr(SPRN_HSRR0, vcpu->arch.regs.nip);
mtspr(SPRN_HSRR1, (vcpu->arch.shregs.msr & ~MSR_HV) | MSR_ME);
/*
* On POWER9 DD2.1 and below, sometimes on a Hypervisor Data Storage
* Interrupt (HDSI) the HDSISR is not be updated at all.
*
* To work around this we put a canary value into the HDSISR before
* returning to a guest and then check for this canary when we take a
* HDSI. If we find the canary on a HDSI, we know the hardware didn't
* update the HDSISR. In this case we return to the guest to retake the
* HDSI which should correctly update the HDSISR the second time HDSI
* entry.
*
* Just do this on all p9 processors for now.
*/
mtspr(SPRN_HDSISR, HDSISR_CANARY);
mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
mtspr(SPRN_AMOR, ~0UL);
local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_HV_FAST;
if (kvm_is_radix(kvm)) {
if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
__mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0);
switch_mmu_to_guest_radix(kvm, vcpu, lpcr);
if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
__mtmsrd(0, 1); /* clear RI */
} else {
__mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0);
switch_mmu_to_guest_hpt(kvm, vcpu, lpcr);
}
/* TLBIEL uses LPID=LPIDR, so run this after setting guest LPID */
kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested);
/*
* P9 suppresses the HDEC exception when LPCR[HDICE] = 0,
* so set guest LPCR (with HDICE) before writing HDEC.
*/
mtspr(SPRN_HDEC, hdec);
mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
accumulate_time(vcpu, &vcpu->arch.guest_time);
kvmppc_p9_enter_guest(vcpu);
accumulate_time(vcpu, &vcpu->arch.rm_intr);
/* XXX: Could get these from r11/12 and paca exsave instead */
vcpu->arch.shregs.srr0 = mfspr(SPRN_SRR0);
vcpu->arch.shregs.srr1 = mfspr(SPRN_SRR1);
vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
/* 0x2 bit for HSRR is only used by PR and P7/8 HV paths, clear it */
trap = local_paca->kvm_hstate.scratch0 & ~0x2;
/* HSRR interrupts leave MSR[RI] unchanged, SRR interrupts clear it. */
ri_set = false;
if (likely(trap > BOOK3S_INTERRUPT_MACHINE_CHECK)) {
if (trap != BOOK3S_INTERRUPT_SYSCALL &&
(vcpu->arch.shregs.msr & MSR_RI))
ri_set = true;
exsave = local_paca->exgen;
} else if (trap == BOOK3S_INTERRUPT_SYSTEM_RESET) {
exsave = local_paca->exnmi;
} else { /* trap == 0x200 */
exsave = local_paca->exmc;
}
vcpu->arch.regs.gpr[1] = local_paca->kvm_hstate.scratch1;
vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2;
/*
* Only set RI after reading machine check regs (DAR, DSISR, SRR0/1)
* and hstate scratch (which we need to move into exsave to make
* re-entrant vs SRESET/MCE)
*/
if (ri_set) {
if (unlikely(!(mfmsr() & MSR_RI))) {
__mtmsrd(MSR_RI, 1);
WARN_ON_ONCE(1);
}
} else {
WARN_ON_ONCE(mfmsr() & MSR_RI);
__mtmsrd(MSR_RI, 1);
}
vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)];
vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)];
vcpu->arch.regs.gpr[11] = exsave[EX_R11/sizeof(u64)];
vcpu->arch.regs.gpr[12] = exsave[EX_R12/sizeof(u64)];
vcpu->arch.regs.gpr[13] = exsave[EX_R13/sizeof(u64)];
vcpu->arch.ppr = exsave[EX_PPR/sizeof(u64)];
vcpu->arch.cfar = exsave[EX_CFAR/sizeof(u64)];
vcpu->arch.regs.ctr = exsave[EX_CTR/sizeof(u64)];
vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
if (unlikely(trap == BOOK3S_INTERRUPT_MACHINE_CHECK)) {
vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
kvmppc_realmode_machine_check(vcpu);
} else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) {
kvmppc_realmode_hmi_handler();
} else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) {
vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
} else if (trap == BOOK3S_INTERRUPT_H_DATA_STORAGE) {
vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
} else if (trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
} else if (trap == BOOK3S_INTERRUPT_H_FAC_UNAVAIL) {
vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
* Softpatch interrupt for transactional memory emulation cases
* on POWER9 DD2.2. This is early in the guest exit path - we
* haven't saved registers or done a treclaim yet.
*/
} else if (trap == BOOK3S_INTERRUPT_HV_SOFTPATCH) {
vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
/*
* The cases we want to handle here are those where the guest
* is in real suspend mode and is trying to transition to
* transactional mode.
*/
if (local_paca->kvm_hstate.fake_suspend &&
(vcpu->arch.shregs.msr & MSR_TS_S)) {
if (kvmhv_p9_tm_emulation_early(vcpu)) {
/* Prevent it being handled again. */
trap = 0;
}
}
#endif
}
accumulate_time(vcpu, &vcpu->arch.rm_exit);
/* Advance host PURR/SPURR by the amount used by guest */
purr = mfspr(SPRN_PURR);
spurr = mfspr(SPRN_SPURR);
mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
purr - vcpu->arch.purr);
mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
spurr - vcpu->arch.spurr);
vcpu->arch.purr = purr;
vcpu->arch.spurr = spurr;
vcpu->arch.ic = mfspr(SPRN_IC);
vcpu->arch.pid = mfspr(SPRN_PID);
vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
/* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
mtspr(SPRN_PSSCR, host_psscr |
(local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, host_hfscr);
mtspr(SPRN_CIABR, host_ciabr);
mtspr(SPRN_DAWR0, host_dawr0);
mtspr(SPRN_DAWRX0, host_dawrx0);
if (cpu_has_feature(CPU_FTR_DAWR1)) {
mtspr(SPRN_DAWR1, host_dawr1);
mtspr(SPRN_DAWRX1, host_dawrx1);
}
if (kvm_is_radix(kvm)) {
/*
* Since this is radix, do a eieio; tlbsync; ptesync sequence
* in case we interrupted the guest between a tlbie and a
* ptesync.
*/
asm volatile("eieio; tlbsync; ptesync");
}
/*
* cp_abort is required if the processor supports local copy-paste
* to clear the copy buffer that was under control of the guest.
*/
if (cpu_has_feature(CPU_FTR_ARCH_31))
asm volatile(PPC_CP_ABORT);
vc->dpdes = mfspr(SPRN_DPDES);
vc->vtb = mfspr(SPRN_VTB);
mtspr(SPRN_DPDES, 0);
if (vc->pcr)
mtspr(SPRN_PCR, PCR_MASK);
if (vc->tb_offset_applied) {
u64 new_tb = mftb() - vc->tb_offset_applied;
mtspr(SPRN_TBU40, new_tb);
tb = mftb();
if ((tb & 0xffffff) < (new_tb & 0xffffff))
mtspr(SPRN_TBU40, new_tb + 0x1000000);
vc->tb_offset_applied = 0;
}
mtspr(SPRN_HDEC, 0x7fffffff);
save_clear_guest_mmu(kvm, vcpu);
switch_mmu_to_host_radix(kvm, host_pidr);
local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;
/*
* If we are in real mode, only switch MMU on after the MMU is
* switched to host, to avoid the P9_RADIX_PREFETCH_BUG.
*/
__mtmsrd(msr, 0);
end_timing(vcpu);
return trap;
}
EXPORT_SYMBOL_GPL(kvmhv_vcpu_entry_p9);
|