From a1ff38fac93d533b1397a92b485cafe84b30d094 Mon Sep 17 00:00:00 2001
From: "A. Wilcox"
Date: Fri, 16 Feb 2024 00:24:20 -0600
Subject: system/easy-kernel: Update to 6.6.6-mc1

---
 .../0204-amd-deserialised-MSR-access.patch | 134 +++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 system/easy-kernel/0204-amd-deserialised-MSR-access.patch

diff --git a/system/easy-kernel/0204-amd-deserialised-MSR-access.patch b/system/easy-kernel/0204-amd-deserialised-MSR-access.patch
new file mode 100644
index 000000000..cd8c4237a
--- /dev/null
+++ b/system/easy-kernel/0204-amd-deserialised-MSR-access.patch
@@ -0,0 +1,134 @@
+From: Borislav Petkov
+Date: Thu, 22 Jun 2023 11:52:12 +0200
+Subject: [PATCH 07/16] x86/barrier: Do not serialize MSR accesses on AMD
+
+AMD does not have the requirement for a synchronization barrier when
+accessing a certain group of MSRs. Do not incur that unnecessary
+penalty there.
+
+While at it, move to processor.h to avoid include hell. Untangling that
+file properly is a matter for another day.
+
+Some notes on the performance aspect of why this is relevant, courtesy
+of Kishon VijayAbraham:
+
+On an AMD Zen4 system with 96 cores, a modified ipi-bench[1] on a VM
+shows the x2AVIC IPI rate is 3% to 4% lower than the AVIC IPI rate. The
+ipi-bench is modified so that the IPIs are sent between two vCPUs in the
+same CCX. This also requires pinning the vCPU to a physical core to
+prevent any latencies. This simulates the use case of pinning vCPUs to
+the threads of a single CCX to avoid interrupt IPI latency.
+
+In order to avoid run-to-run variance (for both x2AVIC and AVIC), the
+below configurations are done:
+
+  1) Disable Power States in BIOS (to prevent the system from going to a
+     lower power state)
+
+  2) Run the system at a fixed frequency of 2500MHz (to prevent the
+     system from increasing the frequency when the load is higher)
+
+With the above configuration:
+
+*) Performance measured using ipi-bench for AVIC:
+   Average Latency: 1124.98ns [Time to send IPI from one vCPU to another vCPU]
+
+   Cumulative throughput: 42.6759M/s [Total number of IPIs sent in a second from
+   48 vCPUs simultaneously]
+
+*) Performance measured using ipi-bench for x2AVIC:
+   Average Latency: 1172.42ns [Time to send IPI from one vCPU to another vCPU]
+
+   Cumulative throughput: 40.9432M/s [Total number of IPIs sent in a second from
+   48 vCPUs simultaneously]
+
+From the above, x2AVIC latency is ~4% higher than AVIC. However, the
+expectation is for x2AVIC performance to be better than or equivalent to
+AVIC. Upon analyzing the perf captures, it is observed that significant
+time is spent in weak_wrmsr_fence() invoked by x2apic_send_IPI().
+
+With the fix to skip weak_wrmsr_fence():
+
+*) Performance measured using ipi-bench for x2AVIC:
+   Average Latency: 1117.44ns [Time to send IPI from one vCPU to another vCPU]
+
+   Cumulative throughput: 42.9608M/s [Total number of IPIs sent in a second from
+   48 vCPUs simultaneously]
+
+Comparing the performance of x2AVIC with and without the fix, it can be
+seen that the performance improves by ~4%.
+
+Performance captured using an unmodified ipi-bench with the 'mesh-ipi'
+option, with and without weak_wrmsr_fence() on a Zen4 system, also showed
+a significant (~29%) throughput improvement without weak_wrmsr_fence().
+The 'mesh-ipi' option ignores CCX or CCD and just picks a random vCPU.
+
+   Average throughput (10 iterations) with weak_wrmsr_fence():
+   Cumulative throughput: 4933374 IPI/s
+
+   Average throughput (10 iterations) without weak_wrmsr_fence():
+   Cumulative throughput: 6355156 IPI/s
+
+[1] https://github.com/bytedance/kvm-utils/tree/master/microbenchmark/ipi-bench
+
+Signed-off-by: Borislav Petkov (AMD)
+---
+ arch/x86/include/asm/barrier.h   | 18 ------------------
+ arch/x86/include/asm/processor.h | 19 +++++++++++++++++++
+ 2 files changed, 19 insertions(+), 18 deletions(-)
+
+diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
+index 35389b2af..0216f63a3 100644
+--- a/arch/x86/include/asm/barrier.h
++++ b/arch/x86/include/asm/barrier.h
+@@ -81,22 +81,4 @@ do { \
+ 
+ #include <asm-generic/barrier.h>
+ 
+-/*
+- * Make previous memory operations globally visible before
+- * a WRMSR.
+- *
+- * MFENCE makes writes visible, but only affects load/store
+- * instructions. WRMSR is unfortunately not a load/store
+- * instruction and is unaffected by MFENCE. The LFENCE ensures
+- * that the WRMSR is not reordered.
+- *
+- * Most WRMSRs are full serializing instructions themselves and
+- * do not require this barrier. This is only required for the
+- * IA32_TSC_DEADLINE and X2APIC MSRs.
+- */
+-static inline void weak_wrmsr_fence(void)
+-{
+-	asm volatile("mfence; lfence" : : : "memory");
+-}
+-
+ #endif /* _ASM_X86_BARRIER_H */
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
+index a3669a777..3e175d554 100644
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -734,4 +734,23 @@ bool arch_is_platform_page(u64 paddr);
+ 
+ extern bool gds_ucode_mitigated(void);
+ 
++/*
++ * Make previous memory operations globally visible before
++ * a WRMSR.
++ *
++ * MFENCE makes writes visible, but only affects load/store
++ * instructions. WRMSR is unfortunately not a load/store
++ * instruction and is unaffected by MFENCE. The LFENCE ensures
++ * that the WRMSR is not reordered.
++ *
++ * Most WRMSRs are full serializing instructions themselves and
++ * do not require this barrier. This is only required for the
++ * IA32_TSC_DEADLINE and X2APIC MSRs.
++ */
++static inline void weak_wrmsr_fence(void)
++{
++	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
++		asm volatile("mfence; lfence" : : : "memory");
++}
++
+ #endif /* _ASM_X86_PROCESSOR_H */
--
cgit v1.2.3-70-g09d2
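
Note: for readers who want to see the pattern outside kernel context, below
is a minimal, compilable userspace sketch of the vendor-conditional fence
this patch introduces. It is illustrative only: is_amd_cpu() stands in for
the kernel's boot_cpu_data.x86_vendor check, and no MSR is actually written.
It builds with GCC or Clang on x86-64 (cc -O2 sketch.c).

	/* sketch.c - illustrative stand-in, not kernel code */
	#include <stdbool.h>
	#include <stdio.h>
	#include <cpuid.h>

	static bool is_amd_cpu(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* CPUID leaf 0 returns the vendor string in EBX:EDX:ECX;
		 * "AuthenticAMD" places "Auth" (0x68747541) in EBX. */
		if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
			return false;
		return ebx == 0x68747541;
	}

	/* Mirrors the patched helper: only non-AMD CPUs need the
	 * MFENCE;LFENCE pair before a non-serializing WRMSR such as
	 * IA32_TSC_DEADLINE or the x2APIC MSRs. */
	static inline void weak_wrmsr_fence(void)
	{
		if (!is_amd_cpu())
			asm volatile("mfence; lfence" : : : "memory");
	}

	int main(void)
	{
		weak_wrmsr_fence(); /* a real caller would WRMSR right after */
		printf("fence %s\n", is_amd_cpu() ? "skipped (AMD)" : "executed");
		return 0;
	}

In the kernel itself, the hot caller is the x2APIC IPI path: x2apic_send_IPI()
issues weak_wrmsr_fence() immediately before writing the ICR MSR, so on AMD
the helper becomes a no-op, which is where the throughput gains quoted in the
commit message come from.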