diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 26b8c08e2fc404439b10934e1afd3a03abd6e6a6..99adf2fba84e9b65fdbf42cd30bb3a473ac62711 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -19,3 +19,7 @@ config AS_TPAUSE
 	def_bool $(as-instr,tpause %ecx)
 	help
 	  Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7
+config AS_PAUSEOPT
+	def_bool $(as-instr,pauseopt)
+	help
+	  Supported by binutils >= xxx-TBD and LLVM integrated assembler xxx-TBD
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index c6cf63569dc10c9df9dfdf9d1181a75a682bf43f..e329ec7b06ff8134f2af0cfc8adcc3673202f2e6 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -158,7 +158,7 @@
 #define X86_FEATURE_PHE_EN		( 5*32+11) /* PHE enabled */
 #define X86_FEATURE_PMM		( 5*32+12) /* PadLock Montgomery Multiplier */
 #define X86_FEATURE_PMM_EN		( 5*32+13) /* PMM enabled */
-#define X86_FEATURE_ZX_FMA		( 5*32+15) /* FMA supported */
+#define X86_FEATURE_PAUSEOPT		( 5*32+15) /* PAUSEOPT instruction present */
 #define X86_FEATURE_PARALLAX		( 5*32+16) /* Adaptive P-state control present */
 #define X86_FEATURE_PARALLAX_EN	( 5*32+17) /* Adaptive P-state control enabled */
 #define X86_FEATURE_OVERSTRESS		( 5*32+18) /* Overstress for auto overclock present */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 630891d2581989e4e2058a18c5790545f412d73b..c844077f19b6f7bb0e3b81bc790a5fc0e93d4a1d 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -7,6 +7,7 @@
 
 void __init use_tsc_delay(void);
 void __init use_tpause_delay(void);
+void __init use_pauseopt_delay(void);
 void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d96eea53f89a4fa34d17a8a307884910a520694a..543ff7f82a22bf9e1f18d1891eea547598d914ef 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -846,6 +846,16 @@ struct kvm_vcpu_arch {
 
 	/* Protected Guests */
 	bool guest_state_protected;
+
+	/*
+	 * Zhaoxin/Centaur extended software-managed vCPU state.
+	 * - pauseopt_interrupted: set when the pauseopt optimized state is
+	 *   interrupted by a VM exit.
+	 * - pauseopt_rip: the guest RIP at the time of the VM exit, if the
+	 *   VM exit occurred during the pauseopt optimized state.
+	 * These definitions will move to Zhaoxin-specific arch code in the
+	 * future.
+	 */
+	bool pauseopt_interrupted;
+	unsigned long pauseopt_rip;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f52cbc4702a076e0a0611b7dad791514d19ee776..fabd87b324ecff380823a3310d137ac1c2d71591 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -85,6 +85,13 @@
 #define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT  5
 #define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT	  BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT)
 
+#define MSR_PAUSEOPT_CONTROL		0x187f
+/*
+ * The time field occupies bits [31:2], representing a 32-bit value with
+ * bits [1:0] forced to zero.
+ */
+#define MSR_PAUSEOPT_CONTROL_TIME_MASK	(~0x03U)
+
 #define MSR_PKG_CST_CONFIG_CONTROL	0x000000e2
 #define NHM_C3_AUTO_DEMOTE		(1UL << 25)
 #define NHM_C1_AUTO_DEMOTE		(1UL << 26)
@@ -718,6 +725,13 @@
 #define MSR_VIA_RNG			0x0000110b
 #define MSR_VIA_BCR2			0x00001147
 
+/*
+ * Zhaoxin extended VMCS capabilities:
+ * - bit 0: exec-cntl3 VMCS field.
+ */
+#define MSR_ZX_EXT_VMCS_CAPS		0x1675
+#define MSR_ZX_VMCS_EXEC_CTL3		BIT(0)
+
 /* Transmeta defined MSRs */
 #define MSR_TMTA_LONGRUN_CTRL		0x80868010
 #define MSR_TMTA_LONGRUN_FLAGS		0x80868011
@@ -1069,6 +1083,9 @@
 #define MSR_IA32_VMX_VMFUNC		0x00000491
 #define MSR_IA32_VMX_PROCBASED_CTLS3	0x00000492
 
+/* Zhaoxin VT MSRs */
+#define MSR_ZX_VMX_PROCBASED_CTLS3	0x12A7
+
 /* VMX_BASIC bits and bitmasks */
 #define VMX_BASIC_VMCS_SIZE_SHIFT	32
 #define VMX_BASIC_TRUE_CTLS		(1ULL << 55)
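The PAUSEOPT_CONTROL time-field layout mirrors MSR_IA32_UMWAIT_CONTROL. As a quick illustration of the encoding (a standalone sketch, not part of the patch; the helper name is hypothetical):

    #include <assert.h>
    #include <stdint.h>

    #define MSR_PAUSEOPT_CONTROL_TIME_MASK	(~0x03U)

    /*
     * Encode a requested max wait time (in TSC quanta) into the MSR layout:
     * bits [31:2] carry the time and bits [1:0] must remain zero, so the
     * low two bits of the request are simply dropped.
     */
    static uint32_t pauseopt_encode_time(uint32_t max_time)
    {
    	return max_time & MSR_PAUSEOPT_CONTROL_TIME_MASK;
    }

    int main(void)
    {
    	assert(pauseopt_encode_time(100000) == 100000);	/* already aligned */
    	assert(pauseopt_encode_time(100003) == 100000);	/* low bits dropped */
    	return 0;
    }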
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 29dd27b5a339db16cfb1b40aa68004cf7b2bac0c..1d98132632335e1f6a64f46eeee9f825d38427b6 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -25,6 +25,8 @@
 #define TPAUSE_C01_STATE		1
 #define TPAUSE_C02_STATE		0
 
+#define PAUSEOPT_P01_STATE		1
+
 static inline void __monitor(const void *eax, unsigned long ecx,
 			     unsigned long edx)
 {
@@ -140,4 +142,21 @@ static inline void __tpause(u32 ecx, u32 edx, u32 eax)
 #endif
 }
 
+/*
+ * The caller can request P0.1 (lower latency, less power saving).
+ */
+static inline void __pauseopt(u32 ecx, u32 edx, u32 eax)
+{
+	/* "pauseopt %ecx, %edx, %eax;" */
+#ifdef CONFIG_AS_PAUSEOPT
+	asm volatile("pauseopt\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+#else
+	asm volatile(".byte 0xf2, 0x0f, 0xa6, 0xd0\n\t"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+#endif
+}
+
 #endif /* _ASM_X86_MWAIT_H */
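A hedged sketch of the calling convention, mirroring how __tpause() callers split a 64-bit value into EDX:EAX (the wrapper name is hypothetical; per the delay code later in this patch, PAUSEOPT treats EDX:EAX as a TSC delta and may also return early on an external interrupt or the global PAUSEOPT_CONTROL timeout):

    /* Sketch: wait for roughly 'cycles' TSC ticks in the P0.1 state. */
    static inline void pauseopt_wait(u64 cycles)
    {
    	u32 eax = lower_32_bits(cycles);
    	u32 edx = upper_32_bits(cycles);

    	__pauseopt(PAUSEOPT_P01_STATE, edx, eax);
    }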
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 91dbdd6d0746f77f54165bb883e3018ff050b1a0..7cfd540d8d6b4dcb9be2ffad68e0ee31c72fb101 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -229,6 +229,7 @@ enum vmcs_field {
 	ENCLS_EXITING_BITMAP_HIGH	= 0x0000202F,
 	TSC_MULTIPLIER			= 0x00002032,
 	TSC_MULTIPLIER_HIGH		= 0x00002033,
+	PAUSEOPT_TARGET_TSC		= 0x00002200,
 	PASID_DIR0			= 0x00002038,
 	PASID_DIR0_HIGH			= 0x00002039,
 	PASID_DIR1			= 0x0000203a,
@@ -286,6 +287,7 @@ enum vmcs_field {
 	PLE_GAP				= 0x00004020,
 	PLE_WINDOW			= 0x00004022,
 	NOTIFY_WINDOW			= 0x00004024,
+	ZX_TERTIARY_VM_EXEC_CONTROL	= 0x00004200,
 	VM_INSTRUCTION_ERROR		= 0x00004400,
 	VM_EXIT_REASON			= 0x00004402,
 	VM_EXIT_INTR_INFO		= 0x00004404,
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index 4155d24bb20c1242e12c84ba3f535a9c153785f8..9597c777be9fb5a2979cd73055adffc2545f463b 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -89,5 +89,6 @@
 #define VMX_FEATURE_NOTIFY_VM_EXITING	( 2*32+ 31) /* VM-Exit when no event windows after notify window */
 
 /* Tertiary Processor-Based VM-Execution Controls, word 3 */
+#define VMX_FEATURE_GUEST_PAUSEOPT	( 3*32+ 0) /* PAUSEOPT instruction in guest mode */
 #define VMX_FEATURE_IPI_VIRT		( 3*32+ 4) /* Enable IPI virtualization */
 #endif /* _ASM_X86_VMXFEATURES_H */
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index e24c42a56248fa229a654cd936a7f7cf2844bc25..917c519e1d61d6a1de5aa6a886d301876e3a1b4c 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -89,6 +89,7 @@
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_UMWAIT              67
 #define EXIT_REASON_TPAUSE              68
+#define EXIT_REASON_PAUSEOPT            68 /* Zhaoxin: shares exit reason 68 with TPAUSE */
 #define EXIT_REASON_ENQCMD_PASID        72
 #define EXIT_REASON_ENQCMDS_PASID       73
 #define EXIT_REASON_BUS_LOCK            74
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 637b499450d10b4ee8800bd55826923884a2f05a..6c2fcf8cd75f8ed707d04640f0f5a9c6ccd3bd18 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -27,6 +27,7 @@ obj-y			+= bugs.o
 obj-y			+= aperfmperf.o
 obj-y			+= cpuid-deps.o
 obj-y			+= umwait.o
+obj-y			+= pauseopt.o
 
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index ad6982391bc9f43fed4e2627bb2368825383ce4b..fe584faebeb9ec652f1cb7035a58bfde5c3805a7 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -119,6 +119,9 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
 
 	if (detect_extended_topology_early(c) < 0)
 		detect_ht_early(c);
+
+	if ((cpuid_eax(0xC0000000) >= 0xC0000006) && (cpuid_eax(0xC0000006) & 0x1))
+		setup_force_cpu_cap(X86_FEATURE_PAUSEOPT);
 }
 
 static void init_centaur(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 03851240c3e36d4ed5e9ad250eee76410830d6e9..619f85b29fc9592ef9f1a604e21a805a12b45ba0 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -17,11 +17,37 @@ enum vmx_feature_leafs {
 	SECONDARY_CTLS,
 	TERTIARY_CTLS_LOW,
 	TERTIARY_CTLS_HIGH,
+	ZX_TERTIARY_CTLS,
 	NR_VMX_FEATURE_WORDS,
 };
 
 #define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f)
 
+static void init_zhaoxin_ext_capabilities(struct cpuinfo_x86 *c)
+{
+	u32 ext_vmcs_cap = 0;
+	u32 proc_based_ctls3_high = 0;
+	u32 ign, msr_high = 0;
+
+	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN ||
+	      boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR))
+		return;
+
+	if (rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &ext_vmcs_cap, &ign))
+		return;
+
+	if (!(ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3))
+		return;
+
+	rdmsr_safe(MSR_ZX_VMX_PROCBASED_CTLS3, &ign, &msr_high);
+	if (!(msr_high & 0x1))	/* CTLS3 MSR doesn't exist */
+		proc_based_ctls3_high = 0x1;	/* assume PAUSEOPT (bit 0) */
+	else
+		proc_based_ctls3_high = msr_high;
+
+	c->vmx_capability[ZX_TERTIARY_CTLS] = proc_based_ctls3_high;
+}
+
 static void init_vmx_capabilities(struct cpuinfo_x86 *c)
 {
 	u32 supported, funcs, ept, vpid, ign, low, high;
@@ -97,6 +123,8 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_EPT_AD);
 	if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID))
 		set_cpu_cap(c, X86_FEATURE_VPID);
+
+	init_zhaoxin_ext_capabilities(c);
 }
 #endif /* CONFIG_X86_VMX_FEATURE_NAMES */
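For manual verification of the enumeration path above on real hardware, the capability MSR can be read from userspace through the msr driver (a sketch; it assumes root, a loaded msr module, and a Zhaoxin/Centaur CPU — on anything else the read simply fails):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	uint64_t val;
    	int fd = open("/dev/cpu/0/msr", O_RDONLY);

    	if (fd < 0)
    		return 1;
    	/* The pread() offset selects the MSR index: MSR_ZX_EXT_VMCS_CAPS = 0x1675 */
    	if (pread(fd, &val, sizeof(val), 0x1675) != sizeof(val)) {
    		close(fd);
    		return 1;
    	}
    	printf("exec-cntl3 VMCS field: %s\n", (val & 1) ? "supported" : "not supported");
    	close(fd);
    	return 0;
    }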
diff --git a/arch/x86/kernel/cpu/pauseopt.c b/arch/x86/kernel/cpu/pauseopt.c
new file mode 100644
index 0000000000000000000000000000000000000000..ca8dfc142608afa0c94475a3bf688fb1959cdd79
--- /dev/null
+++ b/arch/x86/kernel/cpu/pauseopt.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/syscore_ops.h>
+#include <linux/suspend.h>
+#include <linux/cpu.h>
+
+#include <asm/msr.h>
+#include <asm/mwait.h>
+
+#define PAUSEOPT_CTRL_VAL(max_time)	((max_time) & MSR_PAUSEOPT_CONTROL_TIME_MASK)
+
+/*
+ * Cache the PAUSEOPT_CONTROL MSR. This is a systemwide control. By default,
+ * the pauseopt max time is 100000 TSC quanta and P0.1 is enabled.
+ */
+static u32 pauseopt_control_cached = PAUSEOPT_CTRL_VAL(100000);
+
+/*
+ * Cache the original PAUSEOPT_CONTROL MSR value, as configured by hardware
+ * or BIOS before kernel boot.
+ */
+static u32 orig_pauseopt_control_cached __ro_after_init;
+
+/*
+ * Serialize access to pauseopt_control_cached and the PAUSEOPT_CONTROL MSR
+ * in the sysfs write functions.
+ */
+static DEFINE_MUTEX(pauseopt_lock);
+
+static void pauseopt_update_control_msr(void *unused)
+{
+	lockdep_assert_irqs_disabled();
+	wrmsr(MSR_PAUSEOPT_CONTROL, READ_ONCE(pauseopt_control_cached), 0);
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the global control
+ * value.
+ *
+ * Disable interrupts so the read of pauseopt_control_cached and the WRMSR
+ * are protected against a concurrent sysfs write. Otherwise the sysfs
+ * write could update the cached value after it had been read on this CPU
+ * and issue the IPI before the old value had been written. The IPI would
+ * interrupt, write the new value and after return from IPI the previous
+ * value would be written by this CPU.
+ *
+ * With interrupts disabled the upcoming CPU either sees the new control
+ * value or the IPI is updating this CPU to the new control value after
+ * interrupts have been reenabled.
+ */
+static int pauseopt_cpu_online(unsigned int cpu)
+{
+	local_irq_disable();
+	pauseopt_update_control_msr(NULL);
+	local_irq_enable();
+	return 0;
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR back to the original
+ * control value.
+ */
+static int pauseopt_cpu_offline(unsigned int cpu)
+{
+	/*
+	 * This code is protected by the CPU hotplug already and
+	 * orig_pauseopt_control_cached is never changed after it caches
+	 * the original control MSR value in pauseopt_init(). So there
+	 * is no race condition here.
+	 */
+	wrmsr(MSR_PAUSEOPT_CONTROL, orig_pauseopt_control_cached, 0);
+
+	return 0;
+}
+
+/*
+ * On resume, restore the PAUSEOPT_CONTROL MSR on the boot processor, which
+ * is the only active CPU at this time. The MSR is set up on the APs via the
+ * CPU hotplug callback.
+ *
+ * This function is invoked on resume from suspend and hibernation. On
+ * resume from suspend the restore should not be required, but we neither
+ * trust the firmware nor does it matter if the same value is written
+ * again.
+ */
+static void pauseopt_syscore_resume(void)
+{
+	pauseopt_update_control_msr(NULL);
+}
+
+static struct syscore_ops pauseopt_syscore_ops = {
+	.resume	= pauseopt_syscore_resume,
+};
+
+/* sysfs interface */
+
+static inline u32 pauseopt_ctrl_max_time(u32 ctrl)
+{
+	return ctrl & MSR_PAUSEOPT_CONTROL_TIME_MASK;
+}
+
+static inline void pauseopt_update_control(u32 maxtime)
+{
+	u32 ctrl = maxtime & MSR_PAUSEOPT_CONTROL_TIME_MASK;
+
+	WRITE_ONCE(pauseopt_control_cached, ctrl);
+	/* Propagate to all CPUs */
+	on_each_cpu(pauseopt_update_control_msr, NULL, 1);
+}
+
+static ssize_t
+enable_p01_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", boot_cpu_has(X86_FEATURE_PAUSEOPT) ? 1 : 0);
+}
+static DEVICE_ATTR_RO(enable_p01);
+
+static ssize_t
+max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
+{
+	u32 ctrl = READ_ONCE(pauseopt_control_cached);
+
+	return sprintf(buf, "%u\n", pauseopt_ctrl_max_time(ctrl));
+}
+
+static ssize_t max_time_store(struct device *kobj,
+			      struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	u32 max_time, ctrl;
+	int ret;
+
+	ret = kstrtou32(buf, 0, &max_time);
+	if (ret)
+		return ret;
+
+	/* bits [1:0] must be zero */
+	if (max_time & ~MSR_PAUSEOPT_CONTROL_TIME_MASK)
+		return -EINVAL;
+
+	mutex_lock(&pauseopt_lock);
+
+	ctrl = READ_ONCE(pauseopt_control_cached);
+	if (max_time != pauseopt_ctrl_max_time(ctrl))
+		pauseopt_update_control(max_time);
+
+	mutex_unlock(&pauseopt_lock);
+
+	return count;
+}
+static DEVICE_ATTR_RW(max_time);
+
+static struct attribute *pauseopt_attrs[] = {
+	&dev_attr_enable_p01.attr,
+	&dev_attr_max_time.attr,
+	NULL
+};
+
+static struct attribute_group pauseopt_attr_group = {
+	.attrs = pauseopt_attrs,
+	.name = "pauseopt_control",
+};
+
+static int __init pauseopt_init(void)
+{
+	struct device *dev;
+	u32 ign;
+	int ret;
+
+	if (!boot_cpu_has(X86_FEATURE_PAUSEOPT))
+		return -ENODEV;
+
+	/*
+	 * Cache the original control MSR value before the control MSR is
+	 * changed. This is the only place where orig_pauseopt_control_cached
+	 * is modified.
+	 */
+	rdmsr(MSR_PAUSEOPT_CONTROL, orig_pauseopt_control_cached, ign);
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "pauseopt:online",
+				pauseopt_cpu_online, pauseopt_cpu_offline);
+	if (ret < 0) {
+		/*
+		 * On failure, the control MSR on all CPUs still has the
+		 * original control value.
+		 */
+		return ret;
+	}
+
+	register_syscore_ops(&pauseopt_syscore_ops);
+
+	/*
+	 * Add the pauseopt control interface. Ignore failure, so at least
+	 * the default values are set up in case the machine manages to boot.
+	 */
+	dev = cpu_subsys.dev_root;
+	return sysfs_create_group(&dev->kobj, &pauseopt_attr_group);
+}
+device_initcall(pauseopt_init);
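Since the attribute group is registered under the cpu subsystem root with the name "pauseopt_control", the files should surface as /sys/devices/system/cpu/pauseopt_control/{enable_p01,max_time} (path inferred from the code above). A userspace sketch for reading and updating max_time; note the store rejects any value with bits [1:0] set:

    #include <stdio.h>

    int main(void)
    {
    	const char *path = "/sys/devices/system/cpu/pauseopt_control/max_time";
    	unsigned int max_time;
    	FILE *f = fopen(path, "r+");

    	if (!f)
    		return 1;
    	if (fscanf(f, "%u", &max_time) == 1)
    		printf("max_time: %u TSC quanta\n", max_time);
    	rewind(f);
    	fprintf(f, "%u\n", 200000);	/* multiple of 4, so bits [1:0] are zero */
    	fclose(f);
    	return 0;
    }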
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 23fba69cfa5957a9fe757788dac69195f6e14997..abefe24907d6557696408120aa0478b777f91492 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -75,6 +75,9 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
 
 	if (detect_extended_topology_early(c) < 0)
 		detect_ht_early(c);
+
+	if ((cpuid_eax(0xC0000000) >= 0xC0000006) && (cpuid_eax(0xC0000006) & 0x1))
+		setup_force_cpu_cap(X86_FEATURE_PAUSEOPT);
 }
 
 static void init_zhaoxin(struct cpuinfo_x86 *c)
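The two identical detection hunks (centaur.c and zhaoxin.c) implement a simple protocol: leaf 0xC0000000 EAX reports the highest implemented Centaur leaf, and leaf 0xC0000006 EAX bit 0 advertises PAUSEOPT. A standalone userspace equivalent (x86 only, for illustration):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t cpuid_eax(uint32_t leaf)
    {
    	uint32_t eax, ebx, ecx, edx;

    	__asm__ volatile("cpuid"
    			 : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
    			 : "a"(leaf), "c"(0));
    	return eax;
    }

    int main(void)
    {
    	if (cpuid_eax(0xC0000000) < 0xC0000006) {
    		puts("leaf 0xC0000006 not implemented");
    		return 0;
    	}
    	printf("PAUSEOPT: %s\n", (cpuid_eax(0xC0000006) & 0x1) ? "present" : "absent");
    	return 0;
    }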
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index e42faa792c07931083988931a5061b5e21429499..110b5be08208938861b48544cdf45cd99981f3b9 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -102,6 +102,9 @@ static __init void x86_late_time_init(void)
 
 	if (static_cpu_has(X86_FEATURE_WAITPKG))
 		use_tpause_delay();
+
+	if (static_cpu_has(X86_FEATURE_PAUSEOPT))
+		use_pauseopt_delay();
 }
 
 /*
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 212a4400914a048b8f95e8b69eb40732edeed692..5b557e046e3ac339168be2e12661a12a46d1a421 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -604,6 +604,12 @@ void kvm_set_cpu_caps(void)
 		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) |
 		F(PMM_EN)
 	);
+
+	/*
+	 * Do not hide any feature supported by this leaf; let the guest see
+	 * the original information. Leaf 0xC000_0006 EAX currently carries
+	 * only PAUSEOPT.
+	 */
+	kvm_cpu_cap_mask(CPUID_C000_0006_EAX, F(PAUSEOPT));
 }
 EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
 
@@ -1022,17 +1028,21 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		break;
 	/*Add support for Centaur's CPUID instruction*/
 	case 0xC0000000:
-		/*Just support up to 0xC0000004 now*/
-		entry->eax = min(entry->eax, 0xC0000004);
+		/* Extended to 0xC0000006 */
+		entry->eax = min(entry->eax, 0xC0000006);
 		break;
 	case 0xC0000001:
 		cpuid_entry_override(entry, CPUID_C000_0001_EDX);
 		break;
+	case 0xC0000006:
+		cpuid_entry_override(entry, CPUID_C000_0006_EAX);
+		break;
 	case 3: /* Processor serial number */
 	case 5: /* MONITOR/MWAIT */
 	case 0xC0000002:
 	case 0xC0000003:
 	case 0xC0000004:
+	case 0xC0000005:
 	default:
 		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 		break;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 85407b457efa6f21dea450f88efedcf2c9caaea1..3871122e17a6795941fe263e7eee0c2ed1a0431b 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -93,6 +93,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
 	[CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
 	[CPUID_1_ECX]         = {         1, 0, CPUID_ECX},
 	[CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+	[CPUID_C000_0006_EAX] = {0xc0000006, 0, CPUID_EAX},
 	[CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
 	[CPUID_7_0_EBX]       = {         7, 0, CPUID_EBX},
 	[CPUID_D_1_EAX]       = {       0xd, 1, CPUID_EAX},
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 113eda3c0bed5a1bcd4553ac27cbb641c30b28ea..328e1dbcc679273d7799aba73658dfe7dfd68e51 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -60,6 +60,7 @@ struct vmcs_config {
 	u32 cpu_based_exec_ctrl;
 	u32 cpu_based_2nd_exec_ctrl;
 	u64 cpu_based_3rd_exec_ctrl;
+	u32 zx_cpu_based_3rd_exec_ctrl;
 	u32 vmexit_ctrl;
 	u32 vmentry_ctrl;
 	struct nested_vmx_msrs nested;
@@ -139,6 +140,11 @@ static inline bool cpu_has_tertiary_exec_ctrls(void)
 		CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
 }
 
+static inline bool cpu_has_zx_tertiary_exec_ctrls(void)
+{
+	return !!vmcs_config.zx_cpu_based_3rd_exec_ctrl;
+}
+
 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -265,6 +271,12 @@ static inline bool cpu_has_vmx_pasid_trans(void)
 		SECONDARY_EXEC_PASID_TRANSLATION;
 }
 
+static inline bool cpu_has_vmx_pauseopt(void)
+{
+	return vmcs_config.zx_cpu_based_3rd_exec_ctrl &
+		ZX_TERTIARY_EXEC_GUEST_PAUSEOPT;
+}
+
 static inline bool cpu_has_vmx_waitpkg(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index a32685f75b6c76058f5945f88383fa27214c9699..6a9e75e4dfd94ef2544274da8c221c0f2646d753 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -49,6 +49,7 @@ struct vmcs_controls_shadow {
 	u32 exec;
 	u32 secondary_exec;
 	u64 tertiary_exec;
+	u32 zx_tertiary_exec;
 };
 
 /*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index da6b0f6dc4033b8c28d497be2217e9f6d6561fdc..09485fe4c0375f6f99fae97771c9d2b62cfabc0e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1945,6 +1945,24 @@
 	}
 }
 
+static int zx_vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!is_zhaoxin_cpu())
+		return KVM_MSR_RET_UNHANDLED;
+
+	switch (msr_info->index) {
+	case MSR_PAUSEOPT_CONTROL:
+		if (!msr_info->host_initiated && !vmx_guest_pauseopt_enabled(vmx))
+			return 1;
+
+		msr_info->data = vmx->msr_pauseopt_control;
+		return 0;
+	default:
+		return KVM_MSR_RET_UNHANDLED;	/* Non-Zhaoxin MSR */
+	}
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -1955,6 +1973,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmx_uret_msr *msr;
 	u32 index;
+	int ret;
+
+	ret = zx_vmx_get_msr(vcpu, msr_info);
+	if (ret != KVM_MSR_RET_UNHANDLED)
+		return ret;
 
 	switch (msr_info->index) {
 #ifdef CONFIG_X86_64
@@ -2126,6 +2155,31 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
 	return debugctl;
 }
 
+static int zx_vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 msr_index = msr_info->index;
+	u64 data = msr_info->data;
+
+	if (!is_zhaoxin_cpu())
+		return KVM_MSR_RET_UNHANDLED;
+
+	switch (msr_index) {
+	case MSR_PAUSEOPT_CONTROL:
+		if (!msr_info->host_initiated && !vmx_guest_pauseopt_enabled(vmx))
+			return 1;
+
+		/* Reserved bit 1 and the unimplemented bits [63:32] must be zero */
+		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
+			return 1;
+
+		vmx->msr_pauseopt_control = data;
+		return 0;
+	default:
+		return KVM_MSR_RET_UNHANDLED;	/* Non-Zhaoxin MSR */
+	}
+}
+
 /*
  * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -2140,6 +2194,16 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	u64 data = msr_info->data;
 	u32 index;
 
+	ret = zx_vmx_set_msr(vcpu, msr_info);
+	if (ret != KVM_MSR_RET_UNHANDLED)
+		return ret;
+	ret = 0;
+
 	switch (msr_index) {
 	case MSR_EFER:
 		ret = kvm_set_msr_common(vcpu, msr_info);
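The write path above accepts bit 0 (presumably a P0.1 control, by analogy with IA32_UMWAIT_CONTROL) plus the time field in bits [31:2], while reserved bit 1 and bits [63:32] must be zero. A worked standalone check mirroring that test:

    #include <assert.h>
    #include <stdint.h>

    /* Mirror of the guest-write check: reserved bit 1 and bits [63:32] must be 0 */
    static int pauseopt_msr_write_ok(uint64_t data)
    {
    	return !(data & ((1ULL << 1) | 0xffffffff00000000ULL));
    }

    int main(void)
    {
    	assert(pauseopt_msr_write_ok(100000));		/* plain time value */
    	assert(pauseopt_msr_write_ok(100000 | 1));	/* bit 0 is writable */
    	assert(!pauseopt_msr_write_ok(1ULL << 1));	/* reserved bit 1 */
    	assert(!pauseopt_msr_write_ok(1ULL << 32));	/* upper half */
    	return 0;
    }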
@@ -2620,6 +2684,48 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
 	return ctl_opt & allowed;
 }
 
+static __init int setup_zhaoxin_vmcs_controls(struct vmcs_config *vmcs_conf)
+{
+	u32 zx_ext_vmcs_cap, ign, msr_high = 0;
+	u32 zx_ctl3 = 0;
+
+	if (!is_zhaoxin_cpu())
+		return 0;
+
+	/*
+	 * Zhaoxin uses MSR_ZX_EXT_VMCS_CAPS to enumerate the 3rd CPU-based
+	 * control, rather than a bit in the 2nd CPU-based control.
+	 */
+	if (rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &zx_ext_vmcs_cap, &ign))
+		return 0;
+	if (!(zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3))
+		return 0;
+
+	rdmsr_safe(MSR_ZX_VMX_PROCBASED_CTLS3, &ign, &msr_high);
+	if (msr_high & 0x1) {
+		/* ZX CPU with ZX_VMX_PROCBASED_CTLS3 support */
+		if (adjust_vmx_controls(0,
+					ZX_TERTIARY_EXEC_GUEST_PAUSEOPT, /* only bit 0 for now */
+					MSR_ZX_VMX_PROCBASED_CTLS3,
+					&zx_ctl3))
+			return -EIO;
+	} else {
+		/*
+		 * ZX CPU without ZX_VMX_PROCBASED_CTLS3 support: assume
+		 * PAUSEOPT is supported and set that bit.
+		 */
+		zx_ctl3 |= ZX_TERTIARY_EXEC_GUEST_PAUSEOPT;
+	}
+
+	/* Will be extended with more tertiary controls in the future */
+
+	vmcs_conf->zx_cpu_based_3rd_exec_ctrl = zx_ctl3;
+
+	return 0;
+}
+
 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 				    struct vmx_capability *vmx_cap)
 {
@@ -2826,6 +2932,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	vmcs_conf->vmexit_ctrl         = _vmexit_control;
 	vmcs_conf->vmentry_ctrl        = _vmentry_control;
 
+	if (setup_zhaoxin_vmcs_controls(vmcs_conf))
+		return -EIO;
+
 #if IS_ENABLED(CONFIG_HYPERV)
 	if (enlightened_vmcs)
 		evmcs_sanitize_exec_ctrls(vmcs_conf);
@@ -4434,6 +4544,26 @@ static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
+/*
+ * The way tertiary control adjustments are handled here may need to change
+ * in the future, by introducing a check against the CTLS3 MSR. Current
+ * hardware does not implement CTLS3, but the designers are attempting to
+ * add this MSR through a microcode update.
+ */
+static u32 zx_vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+	struct kvm_vcpu *vcpu = &vmx->vcpu;
+	u32 exec_control = vmcs_config.zx_cpu_based_3rd_exec_ctrl;
+
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_PAUSEOPT))
+		exec_control &= ~ZX_TERTIARY_EXEC_GUEST_PAUSEOPT;
+
+	/* Adjust other features here */
+
+	return exec_control;
+}
+
 /*
  * Adjust a single secondary execution control bit to intercept/allow an
  * instruction in the guest. This is usually done based on whether or not a
@@ -4616,6 +4746,25 @@ static int vmx_vcpu_precreate(struct kvm *kvm)
 
 #define VMX_XSS_EXIT_BITMAP 0
 
+static void zx_setup_3rd_ctrls(struct vcpu_vmx *vmx)
+{
+	if (cpu_has_zx_tertiary_exec_ctrls()) {
+		zx_tertiary_exec_controls_set(vmx, zx_vmx_tertiary_exec_control(vmx));
+		/*
+		 * Regardless of whether the guest supports PAUSEOPT, as long
+		 * as the 3rd control exists this field must be initialized
+		 * to 0.
+		 */
+		if (cpu_has_vmx_pauseopt())
+			vmcs_write64(PAUSEOPT_TARGET_TSC, 0);
+	}
+}
+
+static void zx_init_vmcs(struct vcpu_vmx *vmx)
+{
+	zx_setup_3rd_ctrls(vmx);
+}
+
 /*
  * Noting that the initialization of Guest-state Area of VMCS is in
  * vmx_vcpu_reset().
@@ -4646,6 +4795,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 	if (cpu_has_tertiary_exec_ctrls())
 		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
 
+	zx_init_vmcs(vmx);
+
 	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
 		vmcs_write64(EOI_EXIT_BITMAP0, 0);
 		vmcs_write64(EOI_EXIT_BITMAP1, 0);
@@ -6250,6 +6401,17 @@ void dump_vmcs(void)
 	else
 		tertiary_exec_control = 0;
 
+	if (cpu_has_zx_tertiary_exec_ctrls()) {
+		/*
+		 * Zhaoxin currently implements only a few VMCS fields under
+		 * the 3rd exec control; this may be extended in the future.
+		 */
+		pr_err("*** Zhaoxin Specific Fields ***\n");
+		pr_err("Zhaoxin TertiaryExec Cntl = 0x%08x\n",
+		       vmcs_read32(ZX_TERTIARY_VM_EXEC_CONTROL));
+		pr_err("PAUSEOPT Saved TSC = 0x%016llx\n", vmcs_read64(PAUSEOPT_TARGET_TSC));
+	}
+
 	pr_err("*** Guest State ***\n");
 	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
 	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
@@ -7192,6 +7354,57 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 	instrumentation_end();
 }
 
+static bool is_vmexit_during_pauseopt(struct kvm_vcpu *vcpu)
+{
+	const u32 pauseopt_opcode = 0xD0A60FF2;	/* F2 0F A6 D0, little-endian */
+	u8 opcode[4];
+	unsigned long rip;
+	gpa_t gpa;
+	u32 code;
+
+	rip = kvm_rip_read(vcpu);
+	gpa = kvm_mmu_gva_to_gpa_read(vcpu, (gva_t)rip, NULL);
+	if (gpa == UNMAPPED_GVA)
+		return false;
+
+	if (kvm_vcpu_read_guest(vcpu, gpa, opcode, 4) != 0)
+		return false;
+
+	code = le32_to_cpu(*(__le32 *)opcode);
+
+	return code == pauseopt_opcode;
+}
+
+static void zx_vmx_vcpu_run_pre(struct kvm_vcpu *vcpu)
+{
+	unsigned long new_rip;
+
+	if (vcpu->arch.pauseopt_interrupted) {
+		new_rip = kvm_rip_read(vcpu);
+		if (new_rip != vcpu->arch.pauseopt_rip) {
+			/*
+			 * When PAUSEOPT execution in the guest is interrupted
+			 * by some event that causes a VM exit, the pauseopt
+			 * target TSC must be cleared before the next VM entry
+			 * if the guest RIP has changed, to avoid re-entering
+			 * the pauseopt optimized state after entering the
+			 * guest.
+			 */
+			vmcs_write64(PAUSEOPT_TARGET_TSC, 0);
+			vcpu->arch.pauseopt_interrupted = false;
+			vcpu->arch.pauseopt_rip = 0;
+		}
+	}
+}
+
+static void zx_vmx_vcpu_run_post(struct kvm_vcpu *vcpu)
+{
+	if (cpu_has_vmx_pauseopt() && is_vmexit_during_pauseopt(vcpu)) {
+		vcpu->arch.pauseopt_interrupted = true;
+		vcpu->arch.pauseopt_rip = kvm_rip_read(vcpu);
+	}
+}
+
 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	fastpath_t exit_fastpath;
@@ -7225,6 +7438,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
 		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
+	if (is_zhaoxin_cpu())
+		zx_vmx_vcpu_run_pre(vcpu);
+
 	cr3 = __get_current_cr3_fast();
 	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
 		vmcs_writel(HOST_CR3, cr3);
@@ -7318,6 +7534,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx->loaded_vmcs->launched = 1;
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
+	if (is_zhaoxin_cpu())
+		zx_vmx_vcpu_run_post(vcpu);
+
 	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
@@ -8022,6 +8241,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 		vmcs_set_secondary_exec_control(vmx);
 	}
 
+	zx_setup_3rd_ctrls(vmx);
+
 	if (nested_vmx_allowed(vcpu))
 		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
 			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
@@ -8076,6 +8297,12 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	vmx_update_exception_bitmap(vcpu);
 }
 
+static __init void zx_vmx_set_cpu_caps(void)
+{
+	if (cpu_has_vmx_pauseopt())
+		kvm_cpu_cap_check_and_set(X86_FEATURE_PAUSEOPT);
+}
+
 static __init void vmx_set_cpu_caps(void)
 {
 	kvm_set_cpu_caps();
@@ -8121,6 +8348,8 @@ static __init void vmx_set_cpu_caps(void)
 	if (cpu_has_vmx_waitpkg())
 		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
 
+	zx_vmx_set_cpu_caps();
+
 	if (!cpu_has_vmx_pasid_trans())
 		kvm_cpu_cap_clear(X86_FEATURE_ENQCMD);
 	else if (kvm_cpu_cap_has(X86_FEATURE_ENQCMD))
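is_vmexit_during_pauseopt() above sniffs the four bytes at the guest RIP and compares them, as a little-endian u32, against 0xD0A60FF2 — i.e. the byte sequence F2 0F A6 D0 emitted by the .byte fallback in mwait.h. A standalone check of that constant (on a little-endian host, matching what le32_to_cpu() yields on x86):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
    	/* PAUSEOPT encoding used by this patch: F2 0F A6 D0 */
    	const uint8_t opcode[4] = { 0xf2, 0x0f, 0xa6, 0xd0 };
    	uint32_t code;

    	memcpy(&code, opcode, sizeof(code));	/* little-endian load on x86 */
    	assert(code == 0xD0A60FF2);
    	return 0;
    }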
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 61d357c2015d102247cbad0cf1b2195d2c5294e7..567c8bc9ba82a012d3614ac90a2235672cad8f5b 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -232,6 +232,7 @@ struct vcpu_vmx {
 	u64 spec_ctrl;
 	u32 msr_ia32_umwait_control;
+	u32 msr_pauseopt_control;
 
 	u32 secondary_exec_control;
@@ -515,6 +516,12 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
 		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
 }
 
+static inline bool vmx_guest_pauseopt_enabled(struct vcpu_vmx *vmx)
+{
+	return zx_tertiary_exec_controls_get(vmx) &
+		ZX_TERTIARY_EXEC_GUEST_PAUSEOPT;
+}
+
 static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
 {
 	if (!enable_ept)
@@ -543,4 +550,13 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
 	return lapic_in_kernel(vcpu) && enable_ipiv;
 }
 
+static inline bool is_zhaoxin_cpu(void)
+{
+	/* Zhaoxin currently ships CPUs under two x86 vendor brands: Zhaoxin and Centaur */
+	return (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN ||
+		boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR);
+}
+
+#define KVM_MSR_RET_UNHANDLED	2
+
 #endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b3f2af065c7db3d07da04991d8ddedf2155cab99..9f6f4524a21f101dc445f9c2808ddbe40f4ab856 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6166,6 +6166,10 @@ static void kvm_init_msr_list(void)
 			if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
 				continue;
 			break;
+		case MSR_PAUSEOPT_CONTROL:
+			if (!kvm_cpu_cap_has(X86_FEATURE_PAUSEOPT))
+				continue;
+			break;
 		case MSR_IA32_RTIT_CTL:
 		case MSR_IA32_RTIT_STATUS:
 			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 0e65d00e2339ff95e022d03ee9012346d2ead8b3..96bf5b3baacd399f69d662fabe140b7cf739bfc6 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -117,6 +117,23 @@ static void delay_halt_tpause(u64 start, u64 cycles)
 	__tpause(TPAUSE_C02_STATE, edx, eax);
 }
 
+/*
+ * On Zhaoxin CPUs the PAUSEOPT instruction waits until any of:
+ * 1) the TSC delta exceeds the value provided in EDX:EAX
+ * 2) the global timeout in PAUSEOPT_CONTROL is exceeded
+ * 3) an external interrupt occurs
+ */
+static void delay_halt_pauseopt(u64 unused, u64 cycles)
+{
+	u64 until = cycles;
+	u32 eax, edx;
+
+	eax = lower_32_bits(until);
+	edx = upper_32_bits(until);
+
+	__pauseopt(PAUSEOPT_P01_STATE, edx, eax);
+}
+
 /*
  * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
  * counts with TSC frequency. The input value is the number of TSC cycles
@@ -183,6 +200,12 @@ void __init use_tpause_delay(void)
 	delay_halt_fn = delay_halt_tpause;
 	delay_fn = delay_halt;
 }
 
+void __init use_pauseopt_delay(void)
+{
+	delay_halt_fn = delay_halt_pauseopt;
+	delay_fn = delay_halt;
+}
+
 void use_mwaitx_delay(void)
 {
 	delay_halt_fn = delay_halt_mwaitx;
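For context, delay_halt() (pre-existing code, not shown in the hunk) repeatedly invokes delay_halt_fn with the remaining budget until the deadline passes, which is why delay_halt_pauseopt() can ignore its start argument and pass the delta straight through. A simplified sketch of that contract (an approximation of arch/x86/lib/delay.c, not a verbatim copy; preemption handling is omitted):

    /*
     * Simplified model of delay_halt(): the halt function may return early
     * (e.g. on an interrupt), so the loop recomputes the remaining delta.
     */
    static void delay_halt_sketch(u64 start, u64 cycles)
    {
    	u64 until = start + cycles;

    	for (;;) {
    		delay_halt_fn(start, cycles);	/* delay_halt_pauseopt() here */
    		start = rdtsc_ordered();
    		if (start >= until)
    			break;
    		cycles = until - start;
    	}
    }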
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index b8d9a015f1ecf8170ad52fc517344bbb25e1d9b8..ef7c697bb4d4e43eb49d67f2226166feea5a1e1d 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -156,6 +156,7 @@
 #define X86_FEATURE_PHE_EN		( 5*32+11) /* PHE enabled */
 #define X86_FEATURE_PMM		( 5*32+12) /* PadLock Montgomery Multiplier */
 #define X86_FEATURE_PMM_EN		( 5*32+13) /* PMM enabled */
+#define X86_FEATURE_PAUSEOPT		( 5*32+15) /* PAUSEOPT instruction present */
 
 /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */
 #define X86_FEATURE_LAHF_LM		( 6*32+ 0) /* LAHF/SAHF in long mode */
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 5d119b26a6218610fe15b195a8e39108f46bdd44..251196a0cf83fe19cb01e5875c0760c9a74847d8 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -76,6 +76,14 @@
  */
 #define MSR_IA32_UMWAIT_CONTROL_TIME_MASK	(~0x03U)
 
+#define MSR_PAUSEOPT_CONTROL		0x187f
+
+/*
+ * The time field occupies bits [31:2], representing a 32-bit value with
+ * bits [1:0] forced to zero.
+ */
+#define MSR_PAUSEOPT_CONTROL_TIME_MASK	(~0x03U)
+
 /* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */
 #define MSR_IA32_CORE_CAPS			  0x000000cf
 #define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT  5
diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h
index b8ff9e8ac0d516b183eaf2fc64bcd0ddbaf3b15c..8dddeff4a0105074c8bbe52ff7e8a145e70c3236 100644
--- a/tools/arch/x86/include/uapi/asm/vmx.h
+++ b/tools/arch/x86/include/uapi/asm/vmx.h
@@ -88,6 +88,7 @@
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_UMWAIT              67
 #define EXIT_REASON_TPAUSE              68
+#define EXIT_REASON_PAUSEOPT            68 /* Zhaoxin: shares exit reason 68 with TPAUSE */
 
 #define VMX_EXIT_REASONS \
 	{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \