author     royger <royger@FreeBSD.org>    2018-08-16 17:02:02 +0800
committer  royger <royger@FreeBSD.org>    2018-08-16 17:02:02 +0800
commit     22dfdca6df09b2d5fd27e77671fbc63efe9d6b11 (patch)
tree       6216f2f4c97ad2ec1e18f3e52e8ddadfc490c7c7 /emulators
parent     ef12527ef0e20b7763f76ecd6dadf9af46d8e503 (diff)
xen411: apply fixes for XSA-269, XSA-272 and XSA-273
Diffstat (limited to 'emulators')
-rw-r--r--  emulators/xen-kernel411/Makefile  45
-rw-r--r--  emulators/xen-kernel411/files/0001-xen-Port-the-array_index_nospec-infrastructure-from-.patch  213
-rw-r--r--  emulators/xen-kernel411/files/0002-x86-correctly-set-nonlazy_xstate_used-when-loading-f.patch  51
-rw-r--r--  emulators/xen-kernel411/files/0003-x86-spec-ctrl-command-line-handling-adjustments.patch  45
-rw-r--r--  emulators/xen-kernel411/files/0005-mm-page_alloc-correct-first_dirty-calculations-durin.patch  66
-rw-r--r--  emulators/xen-kernel411/files/0006-allow-cpu_down-to-be-called-earlier.patch  58
-rw-r--r--  emulators/xen-kernel411/files/0007-x86-svm-Fixes-and-cleanup-to-svm_inject_event.patch  109
-rw-r--r--  emulators/xen-kernel411/files/0008-cpupools-fix-state-when-downing-a-CPU-failed.patch  55
-rw-r--r--  emulators/xen-kernel411/files/0009-x86-AMD-distinguish-compute-units-from-hyper-threads.patch  121
-rw-r--r--  emulators/xen-kernel411/files/0010-x86-distinguish-CPU-offlining-from-CPU-removal.patch  423
-rw-r--r--  emulators/xen-kernel411/files/0011-x86-possibly-bring-up-all-CPUs-even-if-not-all-are-s.patch  174
-rw-r--r--  emulators/xen-kernel411/files/0012-x86-command-line-option-to-avoid-use-of-secondary-hy.patch  126
-rw-r--r--  emulators/xen-kernel411/files/0013-x86-vmx-Don-t-clobber-dr6-while-debugging-state-is-l.patch  38
-rw-r--r--  emulators/xen-kernel411/files/0014-x86-xstate-Use-a-guests-CPUID-policy-rather-than-all.patch  125
-rw-r--r--  emulators/xen-kernel411/files/0015-x86-xstate-Make-errors-in-xstate-calculations-more-o.patch  64
-rw-r--r--  emulators/xen-kernel411/files/0016-x86-hvm-Disallow-unknown-MSR_EFER-bits.patch  48
-rw-r--r--  emulators/xen-kernel411/files/0017-x86-spec-ctrl-Fix-the-parsing-of-xpti-on-fixed-Intel.patch  83
-rw-r--r--  emulators/xen-kernel411/files/0018-x86-spec-ctrl-Yet-more-fixes-for-xpti-parsing.patch  89
-rw-r--r--  emulators/xen-kernel411/files/0019-x86-vmx-Fix-handing-of-MSR_DEBUGCTL-on-VMExit.patch  281
-rw-r--r--  emulators/xen-kernel411/files/0020-x86-vmx-Defer-vmx_vmcs_exit-as-long-as-possible-in-c.patch  63
-rw-r--r--  emulators/xen-kernel411/files/0021-x86-vmx-API-improvements-for-MSR-load-save-infrastru.patch  309
-rw-r--r--  emulators/xen-kernel411/files/0022-x86-vmx-Internal-cleanup-for-MSR-load-save-infrastru.patch  171
-rw-r--r--  emulators/xen-kernel411/files/0023-x86-vmx-Factor-locate_msr_entry-out-of-vmx_find_msr-.patch  104
-rw-r--r--  emulators/xen-kernel411/files/0024-x86-vmx-Support-remote-access-to-the-MSR-lists.patch  354
-rw-r--r--  emulators/xen-kernel411/files/0025-x86-vmx-Improvements-to-LBR-MSR-handling.patch  176
-rw-r--r--  emulators/xen-kernel411/files/0026-x86-vmx-Pass-an-MSR-value-into-vmx_msr_add.patch  148
-rw-r--r--  emulators/xen-kernel411/files/0027-x86-vmx-Support-load-only-guest-MSR-list-entries.patch  208
-rw-r--r--  emulators/xen-kernel411/files/0028-VMX-fix-vmx_-find-del-_msr-build.patch  61
-rw-r--r--  emulators/xen-kernel411/files/0029-ARM-disable-grant-table-v2.patch  66
-rw-r--r--  emulators/xen-kernel411/files/0030-x86-vtx-Fix-the-checking-for-unknown-invalid-MSR_DEB.patch  133
-rw-r--r--  emulators/xen-kernel411/files/0032-x86-spec-ctrl-Calculate-safe-PTE-addresses-for-L1TF-.patch  313
-rw-r--r--  emulators/xen-kernel411/files/0033-x86-spec-ctrl-Introduce-an-option-to-control-L1TF-mi.patch  226
-rw-r--r--  emulators/xen-kernel411/files/0034-x86-shadow-Infrastructure-to-force-a-PV-guest-into-s.patch  277
-rw-r--r--  emulators/xen-kernel411/files/0035-x86-mm-Plumbing-to-allow-any-PTE-update-to-fail-with.patch  255
-rw-r--r--  emulators/xen-kernel411/files/0036-x86-pv-Force-a-guest-into-shadow-mode-when-it-writes.patch  267
-rw-r--r--  emulators/xen-kernel411/files/0037-x86-spec-ctrl-CPUID-MSR-definitions-for-L1D_FLUSH.patch  134
-rw-r--r--  emulators/xen-kernel411/files/0038-x86-msr-Virtualise-MSR_FLUSH_CMD-for-guests.patch  103
-rw-r--r--  emulators/xen-kernel411/files/0039-x86-spec-ctrl-Introduce-an-option-to-control-L1D_FLU.patch  188
-rw-r--r--  emulators/xen-kernel411/files/0040-x86-Make-spec-ctrl-no-a-global-disable-of-all-mitiga.patch  69
-rw-r--r--  emulators/xen-kernel411/files/0042-x86-write-to-correct-variable-in-parse_pv_l1tf.patch  31
40 files changed, 5869 insertions, 1 deletions
diff --git a/emulators/xen-kernel411/Makefile b/emulators/xen-kernel411/Makefile
index 90d83c62ee88..47a29d864ee9 100644
--- a/emulators/xen-kernel411/Makefile
+++ b/emulators/xen-kernel411/Makefile
@@ -2,7 +2,7 @@
PORTNAME= xen
PORTVERSION= 4.11.0
-PORTREVISION= 0
+PORTREVISION= 1
CATEGORIES= emulators
MASTER_SITES= http://downloads.xenproject.org/release/xen/${PORTVERSION}/
PKGNAMESUFFIX= -kernel411
@@ -47,6 +47,49 @@ EXTRA_PATCHES+= ${FILESDIR}/0001-x86-replace-usage-in-the-linker-script.patch:-p
${FILESDIR}/0002-x86-efi-split-compiler-vs-linker-support.patch:-p1
# Fix PVH Dom0 build with shadow paging
EXTRA_PATCHES+= ${FILESDIR}/0001-x86-pvh-change-the-order-of-the-iommu-initialization.patch:-p1
+# XSA-269 (MSR_DEBUGCTL handling) and XSA-273 (L1TF)
+# Note that due to the high number of patches needed to fix L1TF, the package is
+# brought up to the state of the staging-4.11 branch. This can be removed when
+# 4.11.1 is released.
+EXTRA_PATCHES+= ${FILESDIR}/0001-xen-Port-the-array_index_nospec-infrastructure-from-.patch:-p1 \
+ ${FILESDIR}/0002-x86-correctly-set-nonlazy_xstate_used-when-loading-f.patch:-p1 \
+ ${FILESDIR}/0003-x86-spec-ctrl-command-line-handling-adjustments.patch:-p1 \
+ ${FILESDIR}/0005-mm-page_alloc-correct-first_dirty-calculations-durin.patch:-p1 \
+ ${FILESDIR}/0006-allow-cpu_down-to-be-called-earlier.patch:-p1 \
+ ${FILESDIR}/0007-x86-svm-Fixes-and-cleanup-to-svm_inject_event.patch:-p1 \
+ ${FILESDIR}/0008-cpupools-fix-state-when-downing-a-CPU-failed.patch:-p1 \
+ ${FILESDIR}/0009-x86-AMD-distinguish-compute-units-from-hyper-threads.patch:-p1 \
+ ${FILESDIR}/0010-x86-distinguish-CPU-offlining-from-CPU-removal.patch:-p1 \
+ ${FILESDIR}/0011-x86-possibly-bring-up-all-CPUs-even-if-not-all-are-s.patch:-p1 \
+ ${FILESDIR}/0012-x86-command-line-option-to-avoid-use-of-secondary-hy.patch:-p1 \
+ ${FILESDIR}/0013-x86-vmx-Don-t-clobber-dr6-while-debugging-state-is-l.patch:-p1 \
+ ${FILESDIR}/0014-x86-xstate-Use-a-guests-CPUID-policy-rather-than-all.patch:-p1 \
+ ${FILESDIR}/0015-x86-xstate-Make-errors-in-xstate-calculations-more-o.patch:-p1 \
+ ${FILESDIR}/0016-x86-hvm-Disallow-unknown-MSR_EFER-bits.patch:-p1 \
+ ${FILESDIR}/0017-x86-spec-ctrl-Fix-the-parsing-of-xpti-on-fixed-Intel.patch:-p1 \
+ ${FILESDIR}/0018-x86-spec-ctrl-Yet-more-fixes-for-xpti-parsing.patch:-p1 \
+ ${FILESDIR}/0019-x86-vmx-Fix-handing-of-MSR_DEBUGCTL-on-VMExit.patch:-p1 \
+ ${FILESDIR}/0020-x86-vmx-Defer-vmx_vmcs_exit-as-long-as-possible-in-c.patch:-p1 \
+ ${FILESDIR}/0021-x86-vmx-API-improvements-for-MSR-load-save-infrastru.patch:-p1 \
+ ${FILESDIR}/0022-x86-vmx-Internal-cleanup-for-MSR-load-save-infrastru.patch:-p1 \
+ ${FILESDIR}/0023-x86-vmx-Factor-locate_msr_entry-out-of-vmx_find_msr-.patch:-p1 \
+ ${FILESDIR}/0024-x86-vmx-Support-remote-access-to-the-MSR-lists.patch:-p1 \
+ ${FILESDIR}/0025-x86-vmx-Improvements-to-LBR-MSR-handling.patch:-p1 \
+ ${FILESDIR}/0026-x86-vmx-Pass-an-MSR-value-into-vmx_msr_add.patch:-p1 \
+ ${FILESDIR}/0027-x86-vmx-Support-load-only-guest-MSR-list-entries.patch:-p1 \
+ ${FILESDIR}/0028-VMX-fix-vmx_-find-del-_msr-build.patch:-p1 \
+ ${FILESDIR}/0029-ARM-disable-grant-table-v2.patch:-p1 \
+ ${FILESDIR}/0030-x86-vtx-Fix-the-checking-for-unknown-invalid-MSR_DEB.patch:-p1 \
+ ${FILESDIR}/0032-x86-spec-ctrl-Calculate-safe-PTE-addresses-for-L1TF-.patch:-p1 \
+ ${FILESDIR}/0033-x86-spec-ctrl-Introduce-an-option-to-control-L1TF-mi.patch:-p1 \
+ ${FILESDIR}/0034-x86-shadow-Infrastructure-to-force-a-PV-guest-into-s.patch:-p1 \
+ ${FILESDIR}/0035-x86-mm-Plumbing-to-allow-any-PTE-update-to-fail-with.patch:-p1 \
+ ${FILESDIR}/0036-x86-pv-Force-a-guest-into-shadow-mode-when-it-writes.patch:-p1 \
+ ${FILESDIR}/0037-x86-spec-ctrl-CPUID-MSR-definitions-for-L1D_FLUSH.patch:-p1 \
+ ${FILESDIR}/0038-x86-msr-Virtualise-MSR_FLUSH_CMD-for-guests.patch:-p1 \
+ ${FILESDIR}/0039-x86-spec-ctrl-Introduce-an-option-to-control-L1D_FLU.patch:-p1 \
+ ${FILESDIR}/0040-x86-Make-spec-ctrl-no-a-global-disable-of-all-mitiga.patch:-p1 \
+ ${FILESDIR}/0042-x86-write-to-correct-variable-in-parse_pv_l1tf.patch:-p1
.include <bsd.port.options.mk>
diff --git a/emulators/xen-kernel411/files/0001-xen-Port-the-array_index_nospec-infrastructure-from-.patch b/emulators/xen-kernel411/files/0001-xen-Port-the-array_index_nospec-infrastructure-from-.patch
new file mode 100644
index 000000000000..b0ee283d1fea
--- /dev/null
+++ b/emulators/xen-kernel411/files/0001-xen-Port-the-array_index_nospec-infrastructure-from-.patch
@@ -0,0 +1,213 @@
+From e932371d6ae0f69b89abb2dce725483c75356de2 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 30 Jul 2018 11:17:27 +0200
+Subject: [PATCH 01/42] xen: Port the array_index_nospec() infrastructure from
+ Linux
+
+This is as the infrastructure appeared in Linux 4.17, adapted slightly for
+Xen.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Julien Grall <julien.grall@arm.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: 2ddfae51d8b1d7b8cd33a4f6ad4d16d27cb869ae
+master date: 2018-07-06 16:49:57 +0100
+---
+ xen/include/asm-arm/arm32/system.h | 18 ++++++++
+ xen/include/asm-arm/arm64/system.h | 22 ++++++++++
+ xen/include/asm-x86/system.h | 24 ++++++++++
+ xen/include/xen/compiler.h | 3 ++
+ xen/include/xen/nospec.h | 70 ++++++++++++++++++++++++++++++
+ 5 files changed, 137 insertions(+)
+ create mode 100644 xen/include/xen/nospec.h
+
+diff --git a/xen/include/asm-arm/arm32/system.h b/xen/include/asm-arm/arm32/system.h
+index c617b40438..ab57abfbc5 100644
+--- a/xen/include/asm-arm/arm32/system.h
++++ b/xen/include/asm-arm/arm32/system.h
+@@ -48,6 +48,24 @@ static inline int local_fiq_is_enabled(void)
+ return !(flags & PSR_FIQ_MASK);
+ }
+
++#define CSDB ".inst 0xe320f014"
++
++static inline unsigned long array_index_mask_nospec(unsigned long idx,
++ unsigned long sz)
++{
++ unsigned long mask;
++
++ asm volatile( "cmp %1, %2\n"
++ "sbc %0, %1, %1\n"
++ CSDB
++ : "=r" (mask)
++ : "r" (idx), "Ir" (sz)
++ : "cc" );
++
++ return mask;
++}
++#define array_index_mask_nospec array_index_mask_nospec
++
+ #endif
+ /*
+ * Local variables:
+diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h
+index 2e2ee212a1..2e36573ac6 100644
+--- a/xen/include/asm-arm/arm64/system.h
++++ b/xen/include/asm-arm/arm64/system.h
+@@ -58,6 +58,28 @@ static inline int local_fiq_is_enabled(void)
+ return !(flags & PSR_FIQ_MASK);
+ }
+
++#define csdb() asm volatile ( "hint #20" : : : "memory" )
++
++/*
++ * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz
++ * and 0 otherwise.
++ */
++static inline unsigned long array_index_mask_nospec(unsigned long idx,
++ unsigned long sz)
++{
++ unsigned long mask;
++
++ asm volatile ( "cmp %1, %2\n"
++ "sbc %0, xzr, xzr\n"
++ : "=r" (mask)
++ : "r" (idx), "Ir" (sz)
++ : "cc" );
++ csdb();
++
++ return mask;
++}
++#define array_index_mask_nospec array_index_mask_nospec
++
+ #endif
+ /*
+ * Local variables:
+diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h
+index 43fb6fe489..483cd20afd 100644
+--- a/xen/include/asm-x86/system.h
++++ b/xen/include/asm-x86/system.h
+@@ -221,6 +221,30 @@ static always_inline unsigned long __xadd(
+ #define set_mb(var, value) do { xchg(&var, value); } while (0)
+ #define set_wmb(var, value) do { var = value; smp_wmb(); } while (0)
+
++/**
++ * array_index_mask_nospec() - generate a mask that is ~0UL when the
++ * bounds check succeeds and 0 otherwise
++ * @index: array element index
++ * @size: number of elements in array
++ *
++ * Returns:
++ * 0 - (index < size)
++ */
++static inline unsigned long array_index_mask_nospec(unsigned long index,
++ unsigned long size)
++{
++ unsigned long mask;
++
++ asm volatile ( "cmp %[size], %[index]; sbb %[mask], %[mask];"
++ : [mask] "=r" (mask)
++ : [size] "g" (size), [index] "r" (index) );
++
++ return mask;
++}
++
++/* Override default implementation in nospec.h. */
++#define array_index_mask_nospec array_index_mask_nospec
++
+ #define local_irq_disable() asm volatile ( "cli" : : : "memory" )
+ #define local_irq_enable() asm volatile ( "sti" : : : "memory" )
+
+diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h
+index 533a8ea0f3..a7e05681c9 100644
+--- a/xen/include/xen/compiler.h
++++ b/xen/include/xen/compiler.h
+@@ -81,6 +81,9 @@
+ #pragma GCC visibility push(hidden)
+ #endif
+
++/* Make the optimizer believe the variable can be manipulated arbitrarily. */
++#define OPTIMIZER_HIDE_VAR(var) __asm__ ( "" : "+g" (var) )
++
+ /* This macro obfuscates arithmetic on a variable address so that gcc
+ shouldn't recognize the original var, and make assumptions about it */
+ /*
+diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h
+new file mode 100644
+index 0000000000..48793996e8
+--- /dev/null
++++ b/xen/include/xen/nospec.h
+@@ -0,0 +1,70 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/* Copyright(c) 2018 Linus Torvalds. All rights reserved. */
++/* Copyright(c) 2018 Alexei Starovoitov. All rights reserved. */
++/* Copyright(c) 2018 Intel Corporation. All rights reserved. */
++/* Copyright(c) 2018 Citrix Systems R&D Ltd. All rights reserved. */
++
++#ifndef XEN_NOSPEC_H
++#define XEN_NOSPEC_H
++
++#include <asm/system.h>
++
++/**
++ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
++ * @index: array element index
++ * @size: number of elements in array
++ *
++ * When @index is out of bounds (@index >= @size), the sign bit will be
++ * set. Extend the sign bit to all bits and invert, giving a result of
++ * zero for an out of bounds index, or ~0 if within bounds [0, @size).
++ */
++#ifndef array_index_mask_nospec
++static inline unsigned long array_index_mask_nospec(unsigned long index,
++ unsigned long size)
++{
++ /*
++ * Always calculate and emit the mask even if the compiler
++ * thinks the mask is not needed. The compiler does not take
++ * into account the value of @index under speculation.
++ */
++ OPTIMIZER_HIDE_VAR(index);
++ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
++}
++#endif
++
++/*
++ * array_index_nospec - sanitize an array index after a bounds check
++ *
++ * For a code sequence like:
++ *
++ * if (index < size) {
++ * index = array_index_nospec(index, size);
++ * val = array[index];
++ * }
++ *
++ * ...if the CPU speculates past the bounds check then
++ * array_index_nospec() will clamp the index within the range of [0,
++ * size).
++ */
++#define array_index_nospec(index, size) \
++({ \
++ typeof(index) _i = (index); \
++ typeof(size) _s = (size); \
++ unsigned long _mask = array_index_mask_nospec(_i, _s); \
++ \
++ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \
++ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \
++ \
++ (typeof(_i)) (_i & _mask); \
++})
++
++#endif /* XEN_NOSPEC_H */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-file-style: "BSD"
++ * c-basic-offset: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
+--
+2.18.0
+
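As an aside for readers of the patch above: the sketch below restates the generic array_index_mask_nospec() fallback from xen/include/xen/nospec.h as a standalone C program, so the mask arithmetic can be tried outside the Xen tree. It is illustrative only (it drops OPTIMIZER_HIDE_VAR()/BUILD_BUG_ON() and, like the original, relies on arithmetic right shift of a negative value): an in-bounds index produces an all-ones mask and passes through unchanged, while an out-of-bounds one produces 0 and is clamped to element 0.

    #include <stdio.h>
    #include <limits.h>

    #define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

    /* Same arithmetic as the generic fallback: ~0UL when index < size, 0 otherwise. */
    static unsigned long mask_nospec(unsigned long index, unsigned long size)
    {
        return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
    }

    int main(void)
    {
        unsigned long size = 4;

        for ( unsigned long i = 0; i < 6; i++ )
            printf("index %lu -> mask %#lx -> clamped index %lu\n",
                   i, mask_nospec(i, size), i & mask_nospec(i, size));

        return 0;
    }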
diff --git a/emulators/xen-kernel411/files/0002-x86-correctly-set-nonlazy_xstate_used-when-loading-f.patch b/emulators/xen-kernel411/files/0002-x86-correctly-set-nonlazy_xstate_used-when-loading-f.patch
new file mode 100644
index 000000000000..f131cf67cfc8
--- /dev/null
+++ b/emulators/xen-kernel411/files/0002-x86-correctly-set-nonlazy_xstate_used-when-loading-f.patch
@@ -0,0 +1,51 @@
+From da33530ab393dcc04d3e35424956277669b8d8ce Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Mon, 30 Jul 2018 11:18:54 +0200
+Subject: [PATCH 02/42] x86: correctly set nonlazy_xstate_used when loading
+ full state
+
+In this case, just like xcr0_accum, nonlazy_xstate_used should always be
+set to the intended new value, rather than possibly leaving the flag set
+from a prior state load.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: f46bf0e101ca63118b9db2616e8f51e972d7f563
+master date: 2018-07-09 10:51:02 +0200
+---
+ xen/arch/x86/domctl.c | 3 +--
+ xen/arch/x86/hvm/hvm.c | 3 +--
+ 2 files changed, 2 insertions(+), 4 deletions(-)
+
+diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
+index 8fbbf3aeb3..b04388d663 100644
+--- a/xen/arch/x86/domctl.c
++++ b/xen/arch/x86/domctl.c
+@@ -1187,8 +1187,7 @@ long arch_do_domctl(
+ vcpu_pause(v);
+ v->arch.xcr0 = _xcr0;
+ v->arch.xcr0_accum = _xcr0_accum;
+- if ( _xcr0_accum & XSTATE_NONLAZY )
+- v->arch.nonlazy_xstate_used = 1;
++ v->arch.nonlazy_xstate_used = _xcr0_accum & XSTATE_NONLAZY;
+ compress_xsave_states(v, _xsave_area,
+ evc->size - PV_XSAVE_HDR_SIZE);
+ vcpu_unpause(v);
+diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
+index c23983cdff..279cb88e45 100644
+--- a/xen/arch/x86/hvm/hvm.c
++++ b/xen/arch/x86/hvm/hvm.c
+@@ -1324,8 +1324,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
+
+ v->arch.xcr0 = ctxt->xcr0;
+ v->arch.xcr0_accum = ctxt->xcr0_accum;
+- if ( ctxt->xcr0_accum & XSTATE_NONLAZY )
+- v->arch.nonlazy_xstate_used = 1;
++ v->arch.nonlazy_xstate_used = ctxt->xcr0_accum & XSTATE_NONLAZY;
+ compress_xsave_states(v, &ctxt->save_area,
+ size - offsetof(struct hvm_hw_cpu_xsave, save_area));
+
+--
+2.18.0
+
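To see why the two hunks above replace a conditional set with a plain assignment, here is a minimal standalone sketch (illustrative only; the XSTATE_NONLAZY value below is a placeholder, not Xen's definition): with the old pattern a flag set by an earlier state load survives a later load whose xcr0_accum no longer contains the bit, while the new pattern tracks the intended value on every load.

    #include <stdbool.h>
    #include <stdio.h>

    #define XSTATE_NONLAZY 0x8UL   /* placeholder bit, not Xen's definition */

    static bool nonlazy_old, nonlazy_new;

    static void load_state(unsigned long xcr0_accum)
    {
        if ( xcr0_accum & XSTATE_NONLAZY )          /* pre-patch pattern */
            nonlazy_old = true;                     /* never cleared by later loads */

        nonlazy_new = xcr0_accum & XSTATE_NONLAZY;  /* post-patch pattern */
    }

    int main(void)
    {
        load_state(XSTATE_NONLAZY);   /* first load uses non-lazy state */
        load_state(0);                /* second load does not */

        /* Prints "old pattern: 1, new pattern: 0". */
        printf("old pattern: %d, new pattern: %d\n", nonlazy_old, nonlazy_new);
        return 0;
    }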
diff --git a/emulators/xen-kernel411/files/0003-x86-spec-ctrl-command-line-handling-adjustments.patch b/emulators/xen-kernel411/files/0003-x86-spec-ctrl-command-line-handling-adjustments.patch
new file mode 100644
index 000000000000..55bd9ebd3769
--- /dev/null
+++ b/emulators/xen-kernel411/files/0003-x86-spec-ctrl-command-line-handling-adjustments.patch
@@ -0,0 +1,45 @@
+From 4bdeedbd611c59f07878eb22955f655a81452835 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Mon, 30 Jul 2018 11:19:41 +0200
+Subject: [PATCH 03/42] x86/spec-ctrl: command line handling adjustments
+
+For one, "no-xen" should not imply "no-eager-fpu", as "eager FPU" mode
+is to guard guests, not Xen itself, which is also expressed so by
+print_details().
+
+And then opt_ssbd, despite being off by default, should also be cleared
+by the "no" and "no-xen" sub-options.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: ac3f9a72141a48d40fabfff561d5a7dc0e1b810d
+master date: 2018-07-10 12:22:31 +0200
+---
+ xen/arch/x86/spec_ctrl.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 08e6784c4c..73dc7170c7 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -124,6 +124,8 @@ static int __init parse_spec_ctrl(const char *s)
+ opt_msr_sc_pv = false;
+ opt_msr_sc_hvm = false;
+
++ opt_eager_fpu = 0;
++
+ disable_common:
+ opt_rsb_pv = false;
+ opt_rsb_hvm = false;
+@@ -131,7 +133,7 @@ static int __init parse_spec_ctrl(const char *s)
+ opt_thunk = THUNK_JMP;
+ opt_ibrs = 0;
+ opt_ibpb = false;
+- opt_eager_fpu = 0;
++ opt_ssbd = false;
+ }
+ else if ( val > 0 )
+ rc = -EINVAL;
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0005-mm-page_alloc-correct-first_dirty-calculations-durin.patch b/emulators/xen-kernel411/files/0005-mm-page_alloc-correct-first_dirty-calculations-durin.patch
new file mode 100644
index 000000000000..14cf7f5cd7c5
--- /dev/null
+++ b/emulators/xen-kernel411/files/0005-mm-page_alloc-correct-first_dirty-calculations-durin.patch
@@ -0,0 +1,66 @@
+From ac35e050b64a565fe234dd42e8dac163e946e58d Mon Sep 17 00:00:00 2001
+From: Sergey Dyasli <sergey.dyasli@citrix.com>
+Date: Mon, 30 Jul 2018 11:21:28 +0200
+Subject: [PATCH 05/42] mm/page_alloc: correct first_dirty calculations during
+ block merging
+
+Currently it's possible to hit an assertion in alloc_heap_pages():
+
+Assertion 'first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub)' failed at page_alloc.c:988
+
+This can happen because a piece of logic to calculate first_dirty
+during block merging in free_heap_pages() is missing for the following
+scenario:
+
+1. Current block's first_dirty equals to INVALID_DIRTY_IDX
+2. Successor block is free but its first_dirty != INVALID_DIRTY_IDX
+3. The successor is merged into current block
+4. Current block's first_dirty still equals to INVALID_DIRTY_IDX
+
+This will trigger the assertion during allocation of such block in
+alloc_heap_pages() because there will be pages with PGC_need_scrub
+bit set despite the claim of first_dirty that the block is scrubbed.
+
+Add the missing piece of logic and slightly update the comment for
+the predecessor case to better capture the code's intent.
+
+Fixes 1a37f33ea613 ("mm: Place unscrubbed pages at the end of pagelist")
+
+Signed-off-by: Sergey Dyasli <sergey.dyasli@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+master commit: 1e2df9608857b5355f2ec3b1a34b87a2007dcd16
+master date: 2018-07-12 10:45:11 +0200
+---
+ xen/common/page_alloc.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
+index 20ee1e4897..02aeed7c47 100644
+--- a/xen/common/page_alloc.c
++++ b/xen/common/page_alloc.c
+@@ -1426,7 +1426,7 @@ static void free_heap_pages(
+
+ page_list_del(predecessor, &heap(node, zone, order));
+
+- /* Keep predecessor's first_dirty if it is already set. */
++ /* Update predecessor's first_dirty if necessary. */
+ if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
+ pg->u.free.first_dirty != INVALID_DIRTY_IDX )
+ predecessor->u.free.first_dirty = (1U << order) +
+@@ -1447,6 +1447,12 @@ static void free_heap_pages(
+
+ check_and_stop_scrub(successor);
+
++ /* Update pg's first_dirty if necessary. */
++ if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX &&
++ successor->u.free.first_dirty != INVALID_DIRTY_IDX )
++ pg->u.free.first_dirty = (1U << order) +
++ successor->u.free.first_dirty;
++
+ page_list_del(successor, &heap(node, zone, order));
+ }
+
+--
+2.18.0
+
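A minimal sketch of the bookkeeping the hunk above adds, using a plain struct instead of Xen's page_info (illustrative only): when an unscrubbed successor buddy is merged into a fully scrubbed current block, the current block must inherit the successor's first_dirty index offset by the block size, rather than staying at INVALID_DIRTY_IDX as in the four-step scenario from the commit message.

    #include <stdio.h>

    #define INVALID_DIRTY_IDX (~0U)

    struct blk { unsigned int first_dirty; };

    /* Merge the upper buddy 'succ' into the lower buddy 'pg' at 'order',
     * mirroring the successor handling added to free_heap_pages() above. */
    static void merge_successor(struct blk *pg, const struct blk *succ,
                                unsigned int order)
    {
        if ( pg->first_dirty == INVALID_DIRTY_IDX &&
             succ->first_dirty != INVALID_DIRTY_IDX )
            pg->first_dirty = (1U << order) + succ->first_dirty;
    }

    int main(void)
    {
        struct blk pg = { INVALID_DIRTY_IDX };  /* step 1: current block fully scrubbed */
        struct blk succ = { 3 };                /* step 2: successor has dirty pages */

        merge_successor(&pg, &succ, 4);         /* step 3: merge two order-4 buddies */

        /* Step 4 no longer applies: prints 19 (1 << 4, plus 3), not INVALID. */
        printf("first_dirty after merge: %u\n", pg.first_dirty);
        return 0;
    }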
diff --git a/emulators/xen-kernel411/files/0006-allow-cpu_down-to-be-called-earlier.patch b/emulators/xen-kernel411/files/0006-allow-cpu_down-to-be-called-earlier.patch
new file mode 100644
index 000000000000..704fd621f115
--- /dev/null
+++ b/emulators/xen-kernel411/files/0006-allow-cpu_down-to-be-called-earlier.patch
@@ -0,0 +1,58 @@
+From a44cf0c8728e08858638170a057675ca5479fdc7 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Mon, 30 Jul 2018 11:22:06 +0200
+Subject: [PATCH 06/42] allow cpu_down() to be called earlier
+
+The function's use of the stop-machine logic has so far prevented its
+use ahead of the processing of the "ordinary" initcalls. Since at this
+early time we're in a controlled environment anyway, there's no need for
+such a heavy tool. Additionally this ought to have less of a performance
+impact especially on large systems, compared to the alternative of
+making stop-machine functionality available earlier.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 5894c0a2da66243a89088d309c7e1ea212ab28d6
+master date: 2018-07-16 15:15:12 +0200
+---
+ xen/common/cpu.c | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/xen/common/cpu.c b/xen/common/cpu.c
+index 6350f150bd..653a56b840 100644
+--- a/xen/common/cpu.c
++++ b/xen/common/cpu.c
+@@ -67,12 +67,17 @@ void __init register_cpu_notifier(struct notifier_block *nb)
+ spin_unlock(&cpu_add_remove_lock);
+ }
+
+-static int take_cpu_down(void *unused)
++static void _take_cpu_down(void *unused)
+ {
+ void *hcpu = (void *)(long)smp_processor_id();
+ int notifier_rc = notifier_call_chain(&cpu_chain, CPU_DYING, hcpu, NULL);
+ BUG_ON(notifier_rc != NOTIFY_DONE);
+ __cpu_disable();
++}
++
++static int take_cpu_down(void *arg)
++{
++ _take_cpu_down(arg);
+ return 0;
+ }
+
+@@ -98,7 +103,9 @@ int cpu_down(unsigned int cpu)
+ goto fail;
+ }
+
+- if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
++ if ( unlikely(system_state < SYS_STATE_active) )
++ on_selected_cpus(cpumask_of(cpu), _take_cpu_down, NULL, true);
++ else if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 )
+ goto fail;
+
+ __cpu_die(cpu);
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0007-x86-svm-Fixes-and-cleanup-to-svm_inject_event.patch b/emulators/xen-kernel411/files/0007-x86-svm-Fixes-and-cleanup-to-svm_inject_event.patch
new file mode 100644
index 000000000000..14f95fac78b5
--- /dev/null
+++ b/emulators/xen-kernel411/files/0007-x86-svm-Fixes-and-cleanup-to-svm_inject_event.patch
@@ -0,0 +1,109 @@
+From b53e0defcea1400c03f83d1d5cc30a3b237c8cfe Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 30 Jul 2018 11:22:42 +0200
+Subject: [PATCH 07/42] x86/svm Fixes and cleanup to svm_inject_event()
+
+ * State adjustments (and debug tracing) for #DB/#BP/#PF should not be done
+ for `int $n` instructions. Updates to %cr2 occur even if the exception
+ combines to #DF.
+ * Don't opencode DR_STEP when updating %dr6.
+ * Simplify the logic for calling svm_emul_swint_injection() as in the common
+ case, every condition needs checking.
+ * Fix comments which have become stale as code has moved between components.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+master commit: 8dab867c81ede455009028a9a88edc4ff3b9da88
+master date: 2018-07-17 10:12:40 +0100
+---
+ xen/arch/x86/hvm/svm/svm.c | 41 ++++++++++++++++----------------------
+ 1 file changed, 17 insertions(+), 24 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
+index 165500e3f2..b964c59dad 100644
+--- a/xen/arch/x86/hvm/svm/svm.c
++++ b/xen/arch/x86/hvm/svm/svm.c
+@@ -1432,24 +1432,18 @@ static void svm_inject_event(const struct x86_event *event)
+ * Xen must emulate enough of the event injection to be sure that a
+ * further fault shouldn't occur during delivery. This covers the fact
+ * that hardware doesn't perform DPL checking on injection.
+- *
+- * Also, it accounts for proper positioning of %rip for an event with trap
+- * semantics (where %rip should point after the instruction) which suffers
+- * a fault during injection (at which point %rip should point at the
+- * instruction).
+ */
+ if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION ||
+- (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT ||
+- event->type == X86_EVENTTYPE_SW_EXCEPTION)) )
++ (!cpu_has_svm_nrips && (event->type >= X86_EVENTTYPE_SW_INTERRUPT)) )
+ svm_emul_swint_injection(&_event);
+
+- switch ( _event.vector )
++ switch ( _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT) )
+ {
+ case TRAP_debug:
+ if ( regs->eflags & X86_EFLAGS_TF )
+ {
+ __restore_debug_registers(vmcb, curr);
+- vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000);
++ vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | DR_STEP);
+ }
+ /* fall through */
+ case TRAP_int3:
+@@ -1459,6 +1453,13 @@ static void svm_inject_event(const struct x86_event *event)
+ domain_pause_for_debugger();
+ return;
+ }
++ break;
++
++ case TRAP_page_fault:
++ ASSERT(_event.type == X86_EVENTTYPE_HW_EXCEPTION);
++ curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
++ vmcb_set_cr2(vmcb, _event.cr2);
++ break;
+ }
+
+ if ( unlikely(eventinj.fields.v) &&
+@@ -1481,13 +1482,9 @@ static void svm_inject_event(const struct x86_event *event)
+ * icebp, software events with trap semantics need emulating, so %rip in
+ * the trap frame points after the instruction.
+ *
+- * The x86 emulator (if requested by the x86_swint_emulate_* choice) will
+- * have performed checks such as presence/dpl/etc and believes that the
+- * event injection will succeed without faulting.
+- *
+- * The x86 emulator will always provide fault semantics for software
+- * events, with _trap.insn_len set appropriately. If the injection
+- * requires emulation, move %rip forwards at this point.
++ * svm_emul_swint_injection() has already confirmed that events with trap
++ * semantics won't fault on injection. Position %rip/NextRIP suitably,
++ * and restrict the event type to what hardware will tolerate.
+ */
+ switch ( _event.type )
+ {
+@@ -1544,16 +1541,12 @@ static void svm_inject_event(const struct x86_event *event)
+ eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode);
+ vmcb->eventinj = eventinj;
+
+- if ( _event.vector == TRAP_page_fault )
+- {
+- curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
+- vmcb_set_cr2(vmcb, _event.cr2);
+- HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2));
+- }
++ if ( _event.vector == TRAP_page_fault &&
++ _event.type == X86_EVENTTYPE_HW_EXCEPTION )
++ HVMTRACE_LONG_2D(PF_INJECT, _event.error_code,
++ TRC_PAR_LONG(_event.cr2));
+ else
+- {
+ HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code);
+- }
+ }
+
+ static int svm_event_pending(struct vcpu *v)
+--
+2.18.0
+
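One subtle line in the hunk above is the switch expression _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT). The standalone sketch below (illustrative only; the vector values are the standard x86 exception numbers used by the case labels) shows the trick: -(condition) is either 0 or -1, so OR-ing it in forces the switch value to -1 for software interrupts and none of the vector cases run.

    #include <stdbool.h>
    #include <stdio.h>

    /* Standard x86 exception vectors, matching the case labels above. */
    #define TRAP_debug       1
    #define TRAP_int3        3
    #define TRAP_page_fault 14

    static int injection_switch_value(unsigned char vector, bool is_sw_interrupt)
    {
        /* -(true) == -1 (all bits set); -(false) == 0. */
        return vector | -(int)is_sw_interrupt;
    }

    int main(void)
    {
        printf("%d\n", injection_switch_value(TRAP_int3, false)); /* 3: TRAP_int3 case runs */
        printf("%d\n", injection_switch_value(TRAP_int3, true));  /* -1: no vector case matches */
        return 0;
    }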
diff --git a/emulators/xen-kernel411/files/0008-cpupools-fix-state-when-downing-a-CPU-failed.patch b/emulators/xen-kernel411/files/0008-cpupools-fix-state-when-downing-a-CPU-failed.patch
new file mode 100644
index 000000000000..7985e49b18db
--- /dev/null
+++ b/emulators/xen-kernel411/files/0008-cpupools-fix-state-when-downing-a-CPU-failed.patch
@@ -0,0 +1,55 @@
+From 0a2016ca2fabfe674c311dcfd8e15fec0ba3f7b6 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Mon, 30 Jul 2018 11:23:22 +0200
+Subject: [PATCH 08/42] cpupools: fix state when downing a CPU failed
+
+While I've run into the issue with further patches in place which no
+longer guarantee the per-CPU area to start out as all zeros, the
+CPU_DOWN_FAILED processing looks to have the same issue: By not zapping
+the per-CPU cpupool pointer, cpupool_cpu_add()'s (indirect) invocation
+of schedule_cpu_switch() will trigger the "c != old_pool" assertion
+there.
+
+Clearing the field during CPU_DOWN_PREPARE is too early (afaict this
+should not happen before cpu_disable_scheduler()). Clearing it in
+CPU_DEAD and CPU_DOWN_FAILED would be an option, but would take the same
+piece of code twice. Since the field's value shouldn't matter while the
+CPU is offline, simply clear it (implicitly) for CPU_ONLINE and
+CPU_DOWN_FAILED, but only for other than the suspend/resume case (which
+gets specially handled in cpupool_cpu_remove()).
+
+By adjusting the conditional in cpupool_cpu_add() CPU_DOWN_FAILED
+handling in the suspend case should now also be handled better.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Juergen Gross <jgross@suse.com>
+master commit: cb1ae9a27819cea0c5008773c68a7be6f37eb0e5
+master date: 2018-07-19 09:41:55 +0200
+---
+ xen/common/cpupool.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
+index 999839444e..1e8edcbd57 100644
+--- a/xen/common/cpupool.c
++++ b/xen/common/cpupool.c
+@@ -490,7 +490,7 @@ static int cpupool_cpu_add(unsigned int cpu)
+ cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
+ cpumask_set_cpu(cpu, &cpupool_free_cpus);
+
+- if ( system_state == SYS_STATE_resume )
++ if ( system_state == SYS_STATE_suspend || system_state == SYS_STATE_resume )
+ {
+ struct cpupool **c;
+
+@@ -522,6 +522,7 @@ static int cpupool_cpu_add(unsigned int cpu)
+ * (or unplugging would have failed) and that is the default behavior
+ * anyway.
+ */
++ per_cpu(cpupool, cpu) = NULL;
+ ret = cpupool_assign_cpu_locked(cpupool0, cpu);
+ }
+ out:
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0009-x86-AMD-distinguish-compute-units-from-hyper-threads.patch b/emulators/xen-kernel411/files/0009-x86-AMD-distinguish-compute-units-from-hyper-threads.patch
new file mode 100644
index 000000000000..d12c858a609d
--- /dev/null
+++ b/emulators/xen-kernel411/files/0009-x86-AMD-distinguish-compute-units-from-hyper-threads.patch
@@ -0,0 +1,121 @@
+From bd51a6424202a5f1cd13dee6614bcb69ecbd2458 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Mon, 30 Jul 2018 11:24:01 +0200
+Subject: [PATCH 09/42] x86/AMD: distinguish compute units from hyper-threads
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Fam17 replaces CUs by HTs, which we should reflect accordingly, even if
+the difference is not very big. The most relevant change (requiring some
+code restructuring) is that the topoext feature no longer means there is
+a valid CU ID.
+
+Take the opportunity and convert wrongly plain int variables in
+set_cpu_sibling_map() to unsigned int.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Brian Woods <brian.woods@amd.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 9429b07a0af7f92a5f25e4068e11db881e157495
+master date: 2018-07-19 09:42:42 +0200
+---
+ xen/arch/x86/cpu/amd.c | 16 +++++++++++-----
+ xen/arch/x86/smpboot.c | 32 ++++++++++++++++++++------------
+ 2 files changed, 31 insertions(+), 17 deletions(-)
+
+diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
+index 458a3fe60c..76078b55b2 100644
+--- a/xen/arch/x86/cpu/amd.c
++++ b/xen/arch/x86/cpu/amd.c
+@@ -505,17 +505,23 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+- c->compute_unit_id = ebx & 0xFF;
+ c->x86_num_siblings = ((ebx >> 8) & 0x3) + 1;
++
++ if (c->x86 < 0x17)
++ c->compute_unit_id = ebx & 0xFF;
++ else {
++ c->cpu_core_id = ebx & 0xFF;
++ c->x86_max_cores /= c->x86_num_siblings;
++ }
+ }
+
+ if (opt_cpu_info)
+ printk("CPU %d(%d) -> Processor %d, %s %d\n",
+ cpu, c->x86_max_cores, c->phys_proc_id,
+- cpu_has(c, X86_FEATURE_TOPOEXT) ? "Compute Unit" :
+- "Core",
+- cpu_has(c, X86_FEATURE_TOPOEXT) ? c->compute_unit_id :
+- c->cpu_core_id);
++ c->compute_unit_id != INVALID_CUID ? "Compute Unit"
++ : "Core",
++ c->compute_unit_id != INVALID_CUID ? c->compute_unit_id
++ : c->cpu_core_id);
+ }
+
+ static void early_init_amd(struct cpuinfo_x86 *c)
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index d4478e6132..78ba73578a 100644
+--- a/xen/arch/x86/smpboot.c
++++ b/xen/arch/x86/smpboot.c
+@@ -234,33 +234,41 @@ static void link_thread_siblings(int cpu1, int cpu2)
+ cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
+ }
+
+-static void set_cpu_sibling_map(int cpu)
++static void set_cpu_sibling_map(unsigned int cpu)
+ {
+- int i;
++ unsigned int i;
+ struct cpuinfo_x86 *c = cpu_data;
+
+ cpumask_set_cpu(cpu, &cpu_sibling_setup_map);
+
+ cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
++ cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu));
++ cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
+
+ if ( c[cpu].x86_num_siblings > 1 )
+ {
+ for_each_cpu ( i, &cpu_sibling_setup_map )
+ {
+- if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) {
+- if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
+- (c[cpu].compute_unit_id == c[i].compute_unit_id) )
++ if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id )
++ continue;
++ if ( c[cpu].compute_unit_id != INVALID_CUID &&
++ c[i].compute_unit_id != INVALID_CUID )
++ {
++ if ( c[cpu].compute_unit_id == c[i].compute_unit_id )
++ link_thread_siblings(cpu, i);
++ }
++ else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID &&
++ c[i].cpu_core_id != XEN_INVALID_CORE_ID )
++ {
++ if ( c[cpu].cpu_core_id == c[i].cpu_core_id )
+ link_thread_siblings(cpu, i);
+- } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
+- (c[cpu].cpu_core_id == c[i].cpu_core_id) ) {
+- link_thread_siblings(cpu, i);
+ }
++ else
++ printk(XENLOG_WARNING
++ "CPU%u: unclear relationship with CPU%u\n",
++ cpu, i);
+ }
+ }
+- else
+- {
+- cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
+- }
+
+ if ( c[cpu].x86_max_cores == 1 )
+ {
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0010-x86-distinguish-CPU-offlining-from-CPU-removal.patch b/emulators/xen-kernel411/files/0010-x86-distinguish-CPU-offlining-from-CPU-removal.patch
new file mode 100644
index 000000000000..04383c6a70c6
--- /dev/null
+++ b/emulators/xen-kernel411/files/0010-x86-distinguish-CPU-offlining-from-CPU-removal.patch
@@ -0,0 +1,423 @@
+From 5908b4866b682d9189c36eddf7c898fd95b27ec1 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Mon, 30 Jul 2018 11:24:53 +0200
+Subject: [PATCH 10/42] x86: distinguish CPU offlining from CPU removal
+
+In order to be able to service #MC on offlined CPUs, the GDT, IDT,
+stack, and per-CPU data (which includes the TSS) need to be kept
+allocated. They should only be freed upon CPU removal (which we
+currently don't support, so some code is becoming effectively dead for
+the moment).
+
+Note that for now park_offline_cpus doesn't get set to true anywhere -
+this is going to be the subject of a subsequent patch.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 2e6c8f182c9c50129b1c7a620242861e6ad6a9fb
+master date: 2018-07-19 13:43:33 +0100
+---
+ xen/arch/x86/cpu/mcheck/mce.c | 15 ++++++--
+ xen/arch/x86/domain.c | 9 +++--
+ xen/arch/x86/genapic/x2apic.c | 9 +++--
+ xen/arch/x86/percpu.c | 9 +++--
+ xen/arch/x86/smpboot.c | 71 ++++++++++++++++++++++-------------
+ xen/include/asm-x86/smp.h | 2 +
+ xen/include/xen/cpu.h | 2 +
+ xen/include/xen/cpumask.h | 23 ++++++++++++
+ xen/include/xen/mm.h | 8 ++++
+ xen/include/xen/xmalloc.h | 6 +++
+ 10 files changed, 115 insertions(+), 39 deletions(-)
+
+diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
+index a8c287d124..32273d9208 100644
+--- a/xen/arch/x86/cpu/mcheck/mce.c
++++ b/xen/arch/x86/cpu/mcheck/mce.c
+@@ -692,12 +692,15 @@ static void cpu_bank_free(unsigned int cpu)
+
+ mcabanks_free(poll);
+ mcabanks_free(clr);
++
++ per_cpu(poll_bankmask, cpu) = NULL;
++ per_cpu(mce_clear_banks, cpu) = NULL;
+ }
+
+ static int cpu_bank_alloc(unsigned int cpu)
+ {
+- struct mca_banks *poll = mcabanks_alloc();
+- struct mca_banks *clr = mcabanks_alloc();
++ struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc();
++ struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc();
+
+ if ( !poll || !clr )
+ {
+@@ -725,7 +728,13 @@ static int cpu_callback(
+
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+- cpu_bank_free(cpu);
++ if ( !park_offline_cpus )
++ cpu_bank_free(cpu);
++ break;
++
++ case CPU_REMOVE:
++ if ( park_offline_cpus )
++ cpu_bank_free(cpu);
+ break;
+ }
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 9850a782ec..c39cf2c6e5 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -107,10 +107,11 @@ static void play_dead(void)
+ local_irq_disable();
+
+ /*
+- * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
+- * as they may be freed at any time. In this case, heap corruption or
+- * #PF can occur (when heap debugging is enabled). For example, even
+- * printk() can involve tasklet scheduling, which touches per-cpu vars.
++ * NOTE: After cpu_exit_clear, per-cpu variables may no longer accessible,
++ * as they may be freed at any time if offline CPUs don't get parked. In
++ * this case, heap corruption or #PF can occur (when heap debugging is
++ * enabled). For example, even printk() can involve tasklet scheduling,
++ * which touches per-cpu vars.
+ *
+ * Consider very carefully when adding code to *dead_idle. Most hypervisor
+ * subsystems are unsafe to call.
+diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
+index 4779b0d0d5..d997806272 100644
+--- a/xen/arch/x86/genapic/x2apic.c
++++ b/xen/arch/x86/genapic/x2apic.c
+@@ -201,18 +201,21 @@ static int update_clusterinfo(
+ if ( !cluster_cpus_spare )
+ cluster_cpus_spare = xzalloc(cpumask_t);
+ if ( !cluster_cpus_spare ||
+- !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
++ !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
+ err = -ENOMEM;
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
++ case CPU_REMOVE:
++ if ( park_offline_cpus == (action != CPU_REMOVE) )
++ break;
+ if ( per_cpu(cluster_cpus, cpu) )
+ {
+ cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
+ if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
+- xfree(per_cpu(cluster_cpus, cpu));
++ XFREE(per_cpu(cluster_cpus, cpu));
+ }
+- free_cpumask_var(per_cpu(scratch_mask, cpu));
++ FREE_CPUMASK_VAR(per_cpu(scratch_mask, cpu));
+ break;
+ }
+
+diff --git a/xen/arch/x86/percpu.c b/xen/arch/x86/percpu.c
+index c9997b7937..8be4ebddf4 100644
+--- a/xen/arch/x86/percpu.c
++++ b/xen/arch/x86/percpu.c
+@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int cpu)
+ char *p;
+
+ if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA )
+- return -EBUSY;
++ return 0;
+
+ if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL )
+ return -ENOMEM;
+@@ -76,9 +76,12 @@ static int cpu_percpu_callback(
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+- free_percpu_area(cpu);
++ if ( !park_offline_cpus )
++ free_percpu_area(cpu);
+ break;
+- default:
++ case CPU_REMOVE:
++ if ( park_offline_cpus )
++ free_percpu_area(cpu);
+ break;
+ }
+
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index 78ba73578a..7e76cc3d68 100644
+--- a/xen/arch/x86/smpboot.c
++++ b/xen/arch/x86/smpboot.c
+@@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
+ cpumask_t cpu_online_map __read_mostly;
+ EXPORT_SYMBOL(cpu_online_map);
+
++bool __read_mostly park_offline_cpus;
++
+ unsigned int __read_mostly nr_sockets;
+ cpumask_t **__read_mostly socket_cpumask;
+ static cpumask_t *secondary_socket_cpumask;
+@@ -895,7 +897,14 @@ static void cleanup_cpu_root_pgt(unsigned int cpu)
+ }
+ }
+
+-static void cpu_smpboot_free(unsigned int cpu)
++/*
++ * The 'remove' boolean controls whether a CPU is just getting offlined (and
++ * parked), or outright removed / offlined without parking. Parked CPUs need
++ * things like their stack, GDT, IDT, TSS, and per-CPU data still available.
++ * A few other items, in particular CPU masks, are also retained, as it's
++ * difficult to prove that they're entirely unreferenced from parked CPUs.
++ */
++static void cpu_smpboot_free(unsigned int cpu, bool remove)
+ {
+ unsigned int order, socket = cpu_to_socket(cpu);
+ struct cpuinfo_x86 *c = cpu_data;
+@@ -906,15 +915,19 @@ static void cpu_smpboot_free(unsigned int cpu)
+ socket_cpumask[socket] = NULL;
+ }
+
+- c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
+- c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
+- c[cpu].compute_unit_id = INVALID_CUID;
+ cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
+
+- free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
+- free_cpumask_var(per_cpu(cpu_core_mask, cpu));
+- if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
+- free_cpumask_var(per_cpu(scratch_cpumask, cpu));
++ if ( remove )
++ {
++ c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
++ c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
++ c[cpu].compute_unit_id = INVALID_CUID;
++
++ FREE_CPUMASK_VAR(per_cpu(cpu_sibling_mask, cpu));
++ FREE_CPUMASK_VAR(per_cpu(cpu_core_mask, cpu));
++ if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
++ FREE_CPUMASK_VAR(per_cpu(scratch_cpumask, cpu));
++ }
+
+ cleanup_cpu_root_pgt(cpu);
+
+@@ -936,19 +949,21 @@ static void cpu_smpboot_free(unsigned int cpu)
+ }
+
+ order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+- free_xenheap_pages(per_cpu(gdt_table, cpu), order);
++ if ( remove )
++ FREE_XENHEAP_PAGES(per_cpu(gdt_table, cpu), order);
+
+ free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order);
+
+- order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
+- free_xenheap_pages(idt_tables[cpu], order);
+- idt_tables[cpu] = NULL;
+-
+- if ( stack_base[cpu] != NULL )
++ if ( remove )
+ {
+- memguard_unguard_stack(stack_base[cpu]);
+- free_xenheap_pages(stack_base[cpu], STACK_ORDER);
+- stack_base[cpu] = NULL;
++ order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
++ FREE_XENHEAP_PAGES(idt_tables[cpu], order);
++
++ if ( stack_base[cpu] )
++ {
++ memguard_unguard_stack(stack_base[cpu]);
++ FREE_XENHEAP_PAGES(stack_base[cpu], STACK_ORDER);
++ }
+ }
+ }
+
+@@ -963,15 +978,17 @@ static int cpu_smpboot_alloc(unsigned int cpu)
+ if ( node != NUMA_NO_NODE )
+ memflags = MEMF_node(node);
+
+- stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
++ if ( stack_base[cpu] == NULL )
++ stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
+ if ( stack_base[cpu] == NULL )
+ goto out;
+ memguard_guard_stack(stack_base[cpu]);
+
+ order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+- per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
++ gdt = per_cpu(gdt_table, cpu) ?: alloc_xenheap_pages(order, memflags);
+ if ( gdt == NULL )
+ goto out;
++ per_cpu(gdt_table, cpu) = gdt;
+ memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+ BUILD_BUG_ON(NR_CPUS > 0x10000);
+ gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+@@ -983,7 +1000,8 @@ static int cpu_smpboot_alloc(unsigned int cpu)
+ gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+
+ order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
+- idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
++ if ( idt_tables[cpu] == NULL )
++ idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
+ if ( idt_tables[cpu] == NULL )
+ goto out;
+ memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
+@@ -1011,16 +1029,16 @@ static int cpu_smpboot_alloc(unsigned int cpu)
+ (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
+ goto out;
+
+- if ( !(zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
+- zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
+- alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
++ if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
++ cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
++ cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
+ goto out;
+
+ rc = 0;
+
+ out:
+ if ( rc )
+- cpu_smpboot_free(cpu);
++ cpu_smpboot_free(cpu, true);
+
+ return rc;
+ }
+@@ -1038,9 +1056,10 @@ static int cpu_smpboot_callback(
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+- cpu_smpboot_free(cpu);
++ cpu_smpboot_free(cpu, !park_offline_cpus);
+ break;
+- default:
++ case CPU_REMOVE:
++ cpu_smpboot_free(cpu, true);
+ break;
+ }
+
+diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
+index 4e5f673fec..09c55458df 100644
+--- a/xen/include/asm-x86/smp.h
++++ b/xen/include/asm-x86/smp.h
+@@ -26,6 +26,8 @@ DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
+ DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
+ DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
+
++extern bool park_offline_cpus;
++
+ void smp_send_nmi_allbutself(void);
+
+ void send_IPI_mask(const cpumask_t *, int vector);
+diff --git a/xen/include/xen/cpu.h b/xen/include/xen/cpu.h
+index ffefc09f8e..2fe3ec05d8 100644
+--- a/xen/include/xen/cpu.h
++++ b/xen/include/xen/cpu.h
+@@ -47,6 +47,8 @@ void register_cpu_notifier(struct notifier_block *nb);
+ #define CPU_DYING (0x0007 | NOTIFY_REVERSE)
+ /* CPU_DEAD: CPU is dead. */
+ #define CPU_DEAD (0x0008 | NOTIFY_REVERSE)
++/* CPU_REMOVE: CPU was removed. */
++#define CPU_REMOVE (0x0009 | NOTIFY_REVERSE)
+
+ /* Perform CPU hotplug. May return -EAGAIN. */
+ int cpu_down(unsigned int cpu);
+diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h
+index 42340a098e..4a11bcc3f3 100644
+--- a/xen/include/xen/cpumask.h
++++ b/xen/include/xen/cpumask.h
+@@ -351,16 +351,35 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
+ return *mask != NULL;
+ }
+
++static inline bool cond_alloc_cpumask_var(cpumask_var_t *mask)
++{
++ if (*mask == NULL)
++ *mask = _xmalloc(nr_cpumask_bits / 8, sizeof(long));
++ return *mask != NULL;
++}
++
+ static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
+ {
+ *(void **)mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
+ return *mask != NULL;
+ }
+
++static inline bool cond_zalloc_cpumask_var(cpumask_var_t *mask)
++{
++ if (*mask == NULL)
++ *mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
++ else
++ cpumask_clear(*mask);
++ return *mask != NULL;
++}
++
+ static inline void free_cpumask_var(cpumask_var_t mask)
+ {
+ xfree(mask);
+ }
++
++/* Free an allocated mask, and zero the pointer to it. */
++#define FREE_CPUMASK_VAR(m) XFREE(m)
+ #else
+ typedef cpumask_t cpumask_var_t[1];
+
+@@ -368,16 +387,20 @@ static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
+ {
+ return 1;
+ }
++#define cond_alloc_cpumask_var alloc_cpumask_var
+
+ static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
+ {
+ cpumask_clear(*mask);
+ return 1;
+ }
++#define cond_zalloc_cpumask_var zalloc_cpumask_var
+
+ static inline void free_cpumask_var(cpumask_var_t mask)
+ {
+ }
++
++#define FREE_CPUMASK_VAR(m) free_cpumask_var(m)
+ #endif
+
+ #if NR_CPUS > 1
+diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
+index e928551c91..24654e8e22 100644
+--- a/xen/include/xen/mm.h
++++ b/xen/include/xen/mm.h
+@@ -162,6 +162,14 @@ void free_xenheap_pages(void *v, unsigned int order);
+ bool scrub_free_pages(void);
+ #define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
+ #define free_xenheap_page(v) (free_xenheap_pages(v,0))
++
++/* Free an allocation, and zero the pointer to it. */
++#define FREE_XENHEAP_PAGES(p, o) do { \
++ free_xenheap_pages(p, o); \
++ (p) = NULL; \
++} while ( false )
++#define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0)
++
+ /* Map machine page range in Xen virtual address space. */
+ int map_pages_to_xen(
+ unsigned long virt,
+diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h
+index cc2673d8ae..9aa5edf593 100644
+--- a/xen/include/xen/xmalloc.h
++++ b/xen/include/xen/xmalloc.h
+@@ -26,6 +26,12 @@
+ /* Free any of the above. */
+ extern void xfree(void *);
+
++/* Free an allocation, and zero the pointer to it. */
++#define XFREE(p) do { \
++ xfree(p); \
++ (p) = NULL; \
++} while ( false )
++
+ /* Underlying functions */
+ extern void *_xmalloc(unsigned long size, unsigned long align);
+ extern void *_xzalloc(unsigned long size, unsigned long align);
+--
+2.18.0
+
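The XFREE(), FREE_XENHEAP_PAGES() and FREE_CPUMASK_VAR() helpers introduced above all implement the same free-and-NULL idiom, which is what lets the allocation paths in this patch test a pointer against NULL to decide whether a parked CPU's buffers still exist or need reallocating. A plain-C sketch of the idiom, using malloc/free instead of Xen's allocators (illustrative only):

    #include <stdlib.h>

    /* Free an allocation, and zero the pointer to it (same shape as XFREE()). */
    #define XFREE(p) do { \
        free(p);          \
        (p) = NULL;       \
    } while ( 0 )

    int main(void)
    {
        int *buf = malloc(16 * sizeof(*buf));

        XFREE(buf);          /* buf is NULL from here on...                  */
        XFREE(buf);          /* ...so a repeated free is harmless            */

        if ( buf == NULL )   /* "needs (re)allocation" is a simple NULL test */
            buf = malloc(16 * sizeof(*buf));

        free(buf);
        return 0;
    }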
diff --git a/emulators/xen-kernel411/files/0011-x86-possibly-bring-up-all-CPUs-even-if-not-all-are-s.patch b/emulators/xen-kernel411/files/0011-x86-possibly-bring-up-all-CPUs-even-if-not-all-are-s.patch
new file mode 100644
index 000000000000..136db1d12c75
--- /dev/null
+++ b/emulators/xen-kernel411/files/0011-x86-possibly-bring-up-all-CPUs-even-if-not-all-are-s.patch
@@ -0,0 +1,174 @@
+From 75313e478e894176056e1fc5852136b344a0dc70 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Mon, 30 Jul 2018 11:25:38 +0200
+Subject: [PATCH 11/42] x86: possibly bring up all CPUs even if not all are
+ supposed to be used
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Reportedly Intel CPUs which can't broadcast #MC to all targeted
+cores/threads because some have CR4.MCE clear will shut down. Therefore
+we want to keep CR4.MCE enabled when offlining a CPU, and we need to
+bring up all CPUs in order to be able to set CR4.MCE in the first place.
+
+The use of clear_in_cr4() in cpu_mcheck_disable() was ill advised
+anyway, and to avoid future similar mistakes I'm removing clear_in_cr4()
+altogether right here.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+master commit: 8797d20a6ec2dd75195585a107ce345c51c0a59a
+master date: 2018-07-19 13:43:33 +0100
+---
+ xen/arch/x86/cpu/common.c | 4 ++++
+ xen/arch/x86/cpu/mcheck/mce_intel.c | 2 --
+ xen/arch/x86/mpparse.c | 15 +++++++++++----
+ xen/arch/x86/setup.c | 18 +++++++++++++++---
+ xen/include/asm-x86/processor.h | 6 ------
+ 5 files changed, 30 insertions(+), 15 deletions(-)
+
+diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
+index 528aff1811..fdb022875a 100644
+--- a/xen/arch/x86/cpu/common.c
++++ b/xen/arch/x86/cpu/common.c
+@@ -14,6 +14,7 @@
+ #include <public/sysctl.h> /* for XEN_INVALID_{SOCKET,CORE}_ID */
+
+ #include "cpu.h"
++#include "mcheck/x86_mca.h"
+
+ bool_t opt_arat = 1;
+ boolean_param("arat", opt_arat);
+@@ -355,6 +356,9 @@ static void __init early_cpu_detect(void)
+ hap_paddr_bits = PADDR_BITS;
+ }
+
++ if (c->x86_vendor != X86_VENDOR_AMD)
++ park_offline_cpus = opt_mce;
++
+ initialize_cpu_data(0);
+ }
+
+diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
+index e5dd956a24..4474a34e34 100644
+--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
++++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
+@@ -636,8 +636,6 @@ static void clear_cmci(void)
+
+ static void cpu_mcheck_disable(void)
+ {
+- clear_in_cr4(X86_CR4_MCE);
+-
+ if ( cmci_support && opt_mce )
+ clear_cmci();
+ }
+diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c
+index 49140e46f0..f3f6d48668 100644
+--- a/xen/arch/x86/mpparse.c
++++ b/xen/arch/x86/mpparse.c
+@@ -68,19 +68,26 @@ physid_mask_t phys_cpu_present_map;
+
+ void __init set_nr_cpu_ids(unsigned int max_cpus)
+ {
++ unsigned int tot_cpus = num_processors + disabled_cpus;
++
+ if (!max_cpus)
+- max_cpus = num_processors + disabled_cpus;
++ max_cpus = tot_cpus;
+ if (max_cpus > NR_CPUS)
+ max_cpus = NR_CPUS;
+ else if (!max_cpus)
+ max_cpus = 1;
+ printk(XENLOG_INFO "SMP: Allowing %u CPUs (%d hotplug CPUs)\n",
+ max_cpus, max_t(int, max_cpus - num_processors, 0));
+- nr_cpu_ids = max_cpus;
++
++ if (!park_offline_cpus)
++ tot_cpus = max_cpus;
++ nr_cpu_ids = min(tot_cpus, NR_CPUS + 0u);
++ if (park_offline_cpus && nr_cpu_ids < num_processors)
++ printk(XENLOG_WARNING "SMP: Cannot bring up %u further CPUs\n",
++ num_processors - nr_cpu_ids);
+
+ #ifndef nr_cpumask_bits
+- nr_cpumask_bits = (max_cpus + (BITS_PER_LONG - 1)) &
+- ~(BITS_PER_LONG - 1);
++ nr_cpumask_bits = ROUNDUP(nr_cpu_ids, BITS_PER_LONG);
+ printk(XENLOG_DEBUG "NR_CPUS:%u nr_cpumask_bits:%u\n",
+ NR_CPUS, nr_cpumask_bits);
+ #endif
+diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
+index a3172ca92c..984c948216 100644
+--- a/xen/arch/x86/setup.c
++++ b/xen/arch/x86/setup.c
+@@ -665,7 +665,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ {
+ char *memmap_type = NULL;
+ char *cmdline, *kextra, *loader;
+- unsigned int initrdidx;
++ unsigned int initrdidx, num_parked = 0;
+ multiboot_info_t *mbi;
+ module_t *mod;
+ unsigned long nr_pages, raw_max_page, modules_headroom, *module_map;
+@@ -1494,7 +1494,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ else
+ {
+ set_nr_cpu_ids(max_cpus);
+- max_cpus = nr_cpu_ids;
++ if ( !max_cpus )
++ max_cpus = nr_cpu_ids;
+ }
+
+ if ( xen_guest )
+@@ -1617,16 +1618,27 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ /* Set up node_to_cpumask based on cpu_to_node[]. */
+ numa_add_cpu(i);
+
+- if ( (num_online_cpus() < max_cpus) && !cpu_online(i) )
++ if ( (park_offline_cpus || num_online_cpus() < max_cpus) &&
++ !cpu_online(i) )
+ {
+ int ret = cpu_up(i);
+ if ( ret != 0 )
+ printk("Failed to bring up CPU %u (error %d)\n", i, ret);
++ else if ( num_online_cpus() > max_cpus )
++ {
++ ret = cpu_down(i);
++ if ( !ret )
++ ++num_parked;
++ else
++ printk("Could not re-offline CPU%u (%d)\n", i, ret);
++ }
+ }
+ }
+ }
+
+ printk("Brought up %ld CPUs\n", (long)num_online_cpus());
++ if ( num_parked )
++ printk(XENLOG_INFO "Parked %u CPUs\n", num_parked);
+ smp_cpus_done();
+
+ do_initcalls();
+diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
+index 9924cdf1f3..2bd9e69684 100644
+--- a/xen/include/asm-x86/processor.h
++++ b/xen/include/asm-x86/processor.h
+@@ -337,12 +337,6 @@ static always_inline void set_in_cr4 (unsigned long mask)
+ write_cr4(read_cr4() | mask);
+ }
+
+-static always_inline void clear_in_cr4 (unsigned long mask)
+-{
+- mmu_cr4_features &= ~mask;
+- write_cr4(read_cr4() & ~mask);
+-}
+-
+ static inline unsigned int read_pkru(void)
+ {
+ unsigned int pkru;
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0012-x86-command-line-option-to-avoid-use-of-secondary-hy.patch b/emulators/xen-kernel411/files/0012-x86-command-line-option-to-avoid-use-of-secondary-hy.patch
new file mode 100644
index 000000000000..9c109294974d
--- /dev/null
+++ b/emulators/xen-kernel411/files/0012-x86-command-line-option-to-avoid-use-of-secondary-hy.patch
@@ -0,0 +1,126 @@
+From 353edf12c865d2a1e24173aac841452b90614915 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Mon, 30 Jul 2018 11:26:16 +0200
+Subject: [PATCH 12/42] x86: command line option to avoid use of secondary
+ hyper-threads
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Shared resources (L1 cache and TLB in particular) present a risk of
+information leak via side channels. Provide a means to avoid use of
+hyperthreads in such cases.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: d8f974f1a646c0200b97ebcabb808324b288fadb
+master date: 2018-07-19 13:43:33 +0100
+---
+ docs/misc/xen-command-line.markdown | 7 +++++++
+ xen/arch/x86/setup.c | 8 +++++++-
+ xen/arch/x86/sysctl.c | 16 +++++++++++++++-
+ xen/include/asm-x86/setup.h | 2 ++
+ 4 files changed, 31 insertions(+), 2 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 075e5ea159..3b710b71fb 100644
+--- a/docs/misc/xen-command-line.markdown
++++ b/docs/misc/xen-command-line.markdown
+@@ -1748,6 +1748,13 @@ Use `smap=hvm` to allow SMAP use by HVM guests only.
+ Flag to enable Supervisor Mode Execution Protection
+ Use `smep=hvm` to allow SMEP use by HVM guests only.
+
++### smt (x86)
++> `= <boolean>`
++
++Default: `true`
++
++Control bring up of multiple hyper-threads per CPU core.
++
+ ### snb\_igd\_quirk
+ > `= <boolean> | cap | <integer>`
+
+diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
+index 984c948216..66fd13f93a 100644
+--- a/xen/arch/x86/setup.c
++++ b/xen/arch/x86/setup.c
+@@ -62,6 +62,9 @@ boolean_param("nosmp", opt_nosmp);
+ static unsigned int __initdata max_cpus;
+ integer_param("maxcpus", max_cpus);
+
++int8_t __read_mostly opt_smt = -1;
++boolean_param("smt", opt_smt);
++
+ /* opt_invpcid: If false, don't use INVPCID instruction even if available. */
+ static bool __initdata opt_invpcid = true;
+ boolean_param("invpcid", opt_invpcid);
+@@ -1624,7 +1627,10 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ int ret = cpu_up(i);
+ if ( ret != 0 )
+ printk("Failed to bring up CPU %u (error %d)\n", i, ret);
+- else if ( num_online_cpus() > max_cpus )
++ else if ( num_online_cpus() > max_cpus ||
++ (!opt_smt &&
++ cpu_data[i].compute_unit_id == INVALID_CUID &&
++ cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) )
+ {
+ ret = cpu_down(i);
+ if ( !ret )
+diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
+index 4d372db12b..e704ed7f1c 100644
+--- a/xen/arch/x86/sysctl.c
++++ b/xen/arch/x86/sysctl.c
+@@ -23,6 +23,7 @@
+ #include <asm/hvm/hvm.h>
+ #include <asm/hvm/support.h>
+ #include <asm/processor.h>
++#include <asm/setup.h>
+ #include <asm/smp.h>
+ #include <asm/numa.h>
+ #include <xen/nodemask.h>
+@@ -48,14 +49,27 @@ static void l3_cache_get(void *arg)
+
+ long cpu_up_helper(void *data)
+ {
+- int cpu = (unsigned long)data;
++ unsigned int cpu = (unsigned long)data;
+ int ret = cpu_up(cpu);
++
+ if ( ret == -EBUSY )
+ {
+ /* On EBUSY, flush RCU work and have one more go. */
+ rcu_barrier();
+ ret = cpu_up(cpu);
+ }
++
++ if ( !ret && !opt_smt &&
++ cpu_data[cpu].compute_unit_id == INVALID_CUID &&
++ cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) > 1 )
++ {
++ ret = cpu_down_helper(data);
++ if ( ret )
++ printk("Could not re-offline CPU%u (%d)\n", cpu, ret);
++ else
++ ret = -EPERM;
++ }
++
+ return ret;
+ }
+
+diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
+index 19232afa01..c09a5ff381 100644
+--- a/xen/include/asm-x86/setup.h
++++ b/xen/include/asm-x86/setup.h
+@@ -66,6 +66,8 @@ extern uint8_t kbd_shift_flags;
+ extern unsigned long highmem_start;
+ #endif
+
++extern int8_t opt_smt;
++
+ #ifdef CONFIG_SHADOW_PAGING
+ extern bool opt_dom0_shadow;
+ #else
+--
+2.18.0
+
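The patch above adds an `smt=<boolean>` Xen command-line option (default true). On a FreeBSD dom0 built from this port, the hypervisor command line is normally passed via loader tunables, so an administrator wanting to leave secondary hyper-threads offline could add something like the fragment below to /boot/loader.conf. This is an illustrative sketch only: the xen_kernel path and the other options shown are placeholders from a typical setup, not part of this patch; verify the tunable names against the port's documentation.

    # /boot/loader.conf -- illustrative example, adjust to the local setup
    xen_kernel="/boot/xen"
    xen_cmdline="dom0_mem=4g dom0_max_vcpus=4 smt=false console=vga,com1"

Any of the usual Xen boolean spellings (`smt=false`, `smt=no`, `smt=0`) should be accepted by boolean_param().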
diff --git a/emulators/xen-kernel411/files/0013-x86-vmx-Don-t-clobber-dr6-while-debugging-state-is-l.patch b/emulators/xen-kernel411/files/0013-x86-vmx-Don-t-clobber-dr6-while-debugging-state-is-l.patch
new file mode 100644
index 000000000000..516077fb9731
--- /dev/null
+++ b/emulators/xen-kernel411/files/0013-x86-vmx-Don-t-clobber-dr6-while-debugging-state-is-l.patch
@@ -0,0 +1,38 @@
+From 037fe82cf5fadf0f74c3da70560ee7592a8f2083 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 30 Jul 2018 11:26:53 +0200
+Subject: [PATCH 13/42] x86/vmx: Don't clobber %dr6 while debugging state is
+ lazy
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+c/s 4f36452b63 introduced a write to %dr6 in the #DB intercept case, but the
+guest's debug registers may be lazy at this point, in which case the guest's
+later attempt to read %dr6 will discard this value and use the older, stale
+value.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+master commit: 3cdac2805692c7accde2f405d81cc0be799aee48
+master date: 2018-07-19 14:06:48 +0100
+---
+ xen/arch/x86/hvm/vmx/vmx.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 610c8d6eb9..7189820bfc 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -3701,6 +3701,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
+ */
+ __vmread(EXIT_QUALIFICATION, &exit_qualification);
+ HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
++ __restore_debug_registers(v);
+ write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
+ if ( !v->domain->debugger_attached )
+ {
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0014-x86-xstate-Use-a-guests-CPUID-policy-rather-than-all.patch b/emulators/xen-kernel411/files/0014-x86-xstate-Use-a-guests-CPUID-policy-rather-than-all.patch
new file mode 100644
index 000000000000..da3c464aea38
--- /dev/null
+++ b/emulators/xen-kernel411/files/0014-x86-xstate-Use-a-guests-CPUID-policy-rather-than-all.patch
@@ -0,0 +1,125 @@
+From 543027c9842d8416047ef38846d2de1295052e92 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 30 Jul 2018 11:27:33 +0200
+Subject: [PATCH 14/42] x86/xstate: Use a guest's CPUID policy, rather than
+ allowing all features
+
+It turns out that Xen has never enforced that a domain remain within the
+xstate features advertised in CPUID.
+
+The check of new_bv against xfeature_mask ensures that a domain stays within
+the set of features that Xen has enabled in hardware (and therefore isn't a
+security problem), but this does mean that attempts to level a guest for
+migration safety might not be effective if the guest ignores CPUID.
+
+Check the CPUID policy in validate_xstate() (for incoming migration) and in
+handle_xsetbv() (for guest XSETBV instructions). This subsumes the PKRU check
+for PV guests in handle_xsetbv() (and also demonstrates that I should have
+spotted this problem while reviewing c/s fbf9971241f).
+
+For migration, this is correct despite the current (mis)ordering of data
+because d->arch.cpuid is the applicable max policy.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 361b835fa00d9f45167c50a60e054ccf22c065d7
+master date: 2018-07-19 19:57:26 +0100
+---
+ xen/arch/x86/domctl.c | 2 +-
+ xen/arch/x86/hvm/hvm.c | 2 +-
+ xen/arch/x86/xstate.c | 17 +++++++++++------
+ xen/include/asm-x86/xstate.h | 5 +++--
+ 4 files changed, 16 insertions(+), 10 deletions(-)
+
+diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
+index b04388d663..fa82b6744e 100644
+--- a/xen/arch/x86/domctl.c
++++ b/xen/arch/x86/domctl.c
+@@ -1163,7 +1163,7 @@ long arch_do_domctl(
+ if ( _xcr0_accum )
+ {
+ if ( evc->size >= PV_XSAVE_HDR_SIZE + XSTATE_AREA_MIN_SIZE )
+- ret = validate_xstate(_xcr0, _xcr0_accum,
++ ret = validate_xstate(d, _xcr0, _xcr0_accum,
+ &_xsave_area->xsave_hdr);
+ }
+ else if ( !_xcr0 )
+diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
+index 279cb88e45..d544720876 100644
+--- a/xen/arch/x86/hvm/hvm.c
++++ b/xen/arch/x86/hvm/hvm.c
+@@ -1269,7 +1269,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
+ ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
+ h->cur += desc->length;
+
+- err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum,
++ err = validate_xstate(d, ctxt->xcr0, ctxt->xcr0_accum,
+ (const void *)&ctxt->save_area.xsave_hdr);
+ if ( err )
+ {
+diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
+index b4aea4b50a..1fbb0871d0 100644
+--- a/xen/arch/x86/xstate.c
++++ b/xen/arch/x86/xstate.c
+@@ -670,12 +670,17 @@ static bool valid_xcr0(u64 xcr0)
+ return !(xcr0 & X86_XCR0_BNDREGS) == !(xcr0 & X86_XCR0_BNDCSR);
+ }
+
+-int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
++int validate_xstate(const struct domain *d, uint64_t xcr0, uint64_t xcr0_accum,
++ const struct xsave_hdr *hdr)
+ {
++ const struct cpuid_policy *cp = d->arch.cpuid;
++ uint64_t xcr0_max =
++ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
+ unsigned int i;
+
+ if ( (hdr->xstate_bv & ~xcr0_accum) ||
+ (xcr0 & ~xcr0_accum) ||
++ (xcr0_accum & ~xcr0_max) ||
+ !valid_xcr0(xcr0) ||
+ !valid_xcr0(xcr0_accum) )
+ return -EINVAL;
+@@ -694,18 +699,18 @@ int validate_xstate(u64 xcr0, u64 xcr0_accum, const struct xsave_hdr *hdr)
+ int handle_xsetbv(u32 index, u64 new_bv)
+ {
+ struct vcpu *curr = current;
++ const struct cpuid_policy *cp = curr->domain->arch.cpuid;
++ uint64_t xcr0_max =
++ ((uint64_t)cp->xstate.xcr0_high << 32) | cp->xstate.xcr0_low;
+ u64 mask;
+
+ if ( index != XCR_XFEATURE_ENABLED_MASK )
+ return -EOPNOTSUPP;
+
+- if ( (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) )
++ if ( (new_bv & ~xcr0_max) ||
++ (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) )
+ return -EINVAL;
+
+- /* XCR0.PKRU is disabled on PV mode. */
+- if ( is_pv_vcpu(curr) && (new_bv & X86_XCR0_PKRU) )
+- return -EOPNOTSUPP;
+-
+ if ( !set_xcr0(new_bv) )
+ return -EFAULT;
+
+diff --git a/xen/include/asm-x86/xstate.h b/xen/include/asm-x86/xstate.h
+index 86a4a1f75c..47f602b855 100644
+--- a/xen/include/asm-x86/xstate.h
++++ b/xen/include/asm-x86/xstate.h
+@@ -97,8 +97,9 @@ void xsave(struct vcpu *v, uint64_t mask);
+ void xrstor(struct vcpu *v, uint64_t mask);
+ void xstate_set_init(uint64_t mask);
+ bool xsave_enabled(const struct vcpu *v);
+-int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum,
+- const struct xsave_hdr *);
++int __must_check validate_xstate(const struct domain *d,
++ uint64_t xcr0, uint64_t xcr0_accum,
++ const struct xsave_hdr *hdr);
+ int __must_check handle_xsetbv(u32 index, u64 new_bv);
+ void expand_xsave_states(struct vcpu *v, void *dest, unsigned int size);
+ void compress_xsave_states(struct vcpu *v, const void *src, unsigned int size);
+--
+2.18.0
+
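The commit message above boils down to a containment requirement: the xstate bits a guest uses must lie within its accumulated XCR0, which must lie within the XCR0 maximum derived from the domain's CPUID policy, which in turn must lie within what Xen enabled in hardware. The stand-alone C sketch below models those checks with made-up bit values; it is a simplified illustration of the relationships enforced by validate_xstate()/handle_xsetbv() across this patch and the following one (the valid_xcr0() consistency checks are omitted), not code from the patch.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Simplified model: each level must be contained in the one above it
     * (saved state <= accumulated XCR0 <= CPUID policy <= hardware support).
     */
    static bool xstate_ok(uint64_t xstate_bv, uint64_t xcr0, uint64_t xcr0_accum,
                          uint64_t xcr0_max /* from the domain's CPUID policy */,
                          uint64_t xfeature_mask /* enabled by Xen in hardware */)
    {
        return !(xstate_bv & ~xcr0_accum) &&  /* saved state within accumulated XCR0 */
               !(xcr0 & ~xcr0_accum) &&       /* current XCR0 within accumulated XCR0 */
               !(xcr0_accum & ~xcr0_max) &&   /* accumulated XCR0 within CPUID policy */
               !(xcr0_max & ~xfeature_mask);  /* policy within hardware support */
    }

    int main(void)
    {
        /* x87|SSE|AVX requested, but a hypothetical policy only allows x87|SSE. */
        printf("%d\n", xstate_ok(0x7, 0x7, 0x7, 0x3, 0x7)); /* prints 0: rejected */
        printf("%d\n", xstate_ok(0x3, 0x3, 0x3, 0x3, 0x7)); /* prints 1: accepted */
        return 0;
    }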
diff --git a/emulators/xen-kernel411/files/0015-x86-xstate-Make-errors-in-xstate-calculations-more-o.patch b/emulators/xen-kernel411/files/0015-x86-xstate-Make-errors-in-xstate-calculations-more-o.patch
new file mode 100644
index 000000000000..134f5114bf31
--- /dev/null
+++ b/emulators/xen-kernel411/files/0015-x86-xstate-Make-errors-in-xstate-calculations-more-o.patch
@@ -0,0 +1,64 @@
+From 06d2a763d07d53a4ccc7bd1255ffc9ea01ec1609 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 30 Jul 2018 11:29:00 +0200
+Subject: [PATCH 15/42] x86/xstate: Make errors in xstate calculations more
+ obvious by crashing the domain
+
+If xcr0_max exceeds xfeature_mask, then something is broken with the CPUID
+policy derivation or auditing logic. If hardware rejects new_bv, then
+something is broken with Xen's xstate logic.
+
+In both cases, crash the domain with an obvious error message, to help
+highlight the issues.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: d6371ccb93012db4ad6615fe666205b86308cb4e
+master date: 2018-07-19 19:57:26 +0100
+---
+ xen/arch/x86/xstate.c | 26 +++++++++++++++++++++++---
+ 1 file changed, 23 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
+index 1fbb0871d0..15edd5df96 100644
+--- a/xen/arch/x86/xstate.c
++++ b/xen/arch/x86/xstate.c
+@@ -707,12 +707,32 @@ int handle_xsetbv(u32 index, u64 new_bv)
+ if ( index != XCR_XFEATURE_ENABLED_MASK )
+ return -EOPNOTSUPP;
+
+- if ( (new_bv & ~xcr0_max) ||
+- (new_bv & ~xfeature_mask) || !valid_xcr0(new_bv) )
++ /*
++ * The CPUID logic shouldn't be able to hand out an XCR0 exceeding Xen's
++ * maximum features, but keep the check for robustness.
++ */
++ if ( unlikely(xcr0_max & ~xfeature_mask) )
++ {
++ gprintk(XENLOG_ERR,
++ "xcr0_max %016" PRIx64 " exceeds hardware max %016" PRIx64 "\n",
++ xcr0_max, xfeature_mask);
++ domain_crash(curr->domain);
++
++ return -EINVAL;
++ }
++
++ if ( (new_bv & ~xcr0_max) || !valid_xcr0(new_bv) )
+ return -EINVAL;
+
+- if ( !set_xcr0(new_bv) )
++ /* By this point, new_bv really should be accepted by hardware. */
++ if ( unlikely(!set_xcr0(new_bv)) )
++ {
++ gprintk(XENLOG_ERR, "new_bv %016" PRIx64 " rejected by hardware\n",
++ new_bv);
++ domain_crash(curr->domain);
++
+ return -EFAULT;
++ }
+
+ mask = new_bv & ~curr->arch.xcr0_accum;
+ curr->arch.xcr0 = new_bv;
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0016-x86-hvm-Disallow-unknown-MSR_EFER-bits.patch b/emulators/xen-kernel411/files/0016-x86-hvm-Disallow-unknown-MSR_EFER-bits.patch
new file mode 100644
index 000000000000..e955485b8d73
--- /dev/null
+++ b/emulators/xen-kernel411/files/0016-x86-hvm-Disallow-unknown-MSR_EFER-bits.patch
@@ -0,0 +1,48 @@
+From 7de21555730367497eb01edf6e9e9530224105e7 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 30 Jul 2018 11:29:39 +0200
+Subject: [PATCH 16/42] x86/hvm: Disallow unknown MSR_EFER bits
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+It turns out that nothing ever prevented HVM guests from trying to set unknown
+EFER bits. Generally, this results in a vmentry failure.
+
+For Intel hardware, all implemented bits are covered by the checks.
+
+For AMD hardware, the only EFER bit which isn't covered by the checks is TCE
+(which AFAICT is specific to AMD Fam15/16 hardware). We never advertise TCE
+in CPUID, but it isn't a security problem to have TCE unexpected enabled in
+guest context.
+
+Disallow the setting of bits outside of the EFER_KNOWN_MASK, which prevents
+any vmentry failures for guests, yielding #GP instead.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: ef0269c6215d642a709866f04ba1a1f9f13f3614
+master date: 2018-07-24 11:25:53 +0100
+---
+ xen/arch/x86/hvm/hvm.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
+index d544720876..4cbb688c05 100644
+--- a/xen/arch/x86/hvm/hvm.c
++++ b/xen/arch/x86/hvm/hvm.c
+@@ -907,6 +907,9 @@ const char *hvm_efer_valid(const struct vcpu *v, uint64_t value,
+ else
+ p = &host_cpuid_policy;
+
++ if ( value & ~EFER_KNOWN_MASK )
++ return "Unknown bits set";
++
+ if ( (value & EFER_SCE) && !p->extd.syscall )
+ return "SCE without feature";
+
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0017-x86-spec-ctrl-Fix-the-parsing-of-xpti-on-fixed-Intel.patch b/emulators/xen-kernel411/files/0017-x86-spec-ctrl-Fix-the-parsing-of-xpti-on-fixed-Intel.patch
new file mode 100644
index 000000000000..bfdb9909cf09
--- /dev/null
+++ b/emulators/xen-kernel411/files/0017-x86-spec-ctrl-Fix-the-parsing-of-xpti-on-fixed-Intel.patch
@@ -0,0 +1,83 @@
+From 33ced725e11af4eabd3334d12f53ed807e9e2586 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 30 Jul 2018 11:30:09 +0200
+Subject: [PATCH 17/42] x86/spec-ctrl: Fix the parsing of xpti= on fixed Intel
+ hardware
+
+The calls to xpti_init_default() in parse_xpti() are buggy. The CPUID data
+hasn't been fetched that early, and boot_cpu_has(X86_FEATURE_ARCH_CAPS) will
+always evaluate false.
+
+As a result, the default case won't disable XPTI on Intel hardware which
+advertises ARCH_CAPABILITIES_RDCL_NO.
+
+Simplify parse_xpti() so that it does nothing but set opt_xpti according to
+the passed string, and have init_speculation_mitigations() call
+xpti_init_default() if appropriate. Drop the force parameter, and pass caps
+instead, to avoid redundant re-reading of MSR_ARCH_CAPS.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: be5e2ff6f54e0245331ed360b8786760f82fd673
+master date: 2018-07-24 11:25:54 +0100
+---
+ xen/arch/x86/spec_ctrl.c | 17 +++++------------
+ 1 file changed, 5 insertions(+), 12 deletions(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 73dc7170c7..32a4ea6e99 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -423,17 +423,10 @@ static bool __init should_use_eager_fpu(void)
+ #define OPT_XPTI_DEFAULT 0xff
+ uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT;
+
+-static __init void xpti_init_default(bool force)
++static __init void xpti_init_default(uint64_t caps)
+ {
+- uint64_t caps = 0;
+-
+- if ( !force && (opt_xpti != OPT_XPTI_DEFAULT) )
+- return;
+-
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
+ caps = ARCH_CAPABILITIES_RDCL_NO;
+- else if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
+- rdmsrl(MSR_ARCH_CAPABILITIES, caps);
+
+ if ( caps & ARCH_CAPABILITIES_RDCL_NO )
+ opt_xpti = 0;
+@@ -446,8 +439,6 @@ static __init int parse_xpti(const char *s)
+ const char *ss;
+ int val, rc = 0;
+
+- xpti_init_default(false);
+-
+ do {
+ ss = strchr(s, ',');
+ if ( !ss )
+@@ -465,7 +456,7 @@ static __init int parse_xpti(const char *s)
+
+ default:
+ if ( !strcmp(s, "default") )
+- xpti_init_default(true);
++ opt_xpti = OPT_XPTI_DEFAULT;
+ else if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
+ opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) |
+ (val ? OPT_XPTI_DOM0 : 0);
+@@ -627,7 +618,9 @@ void __init init_speculation_mitigations(void)
+ if ( default_xen_spec_ctrl )
+ setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE);
+
+- xpti_init_default(false);
++ if ( opt_xpti == OPT_XPTI_DEFAULT )
++ xpti_init_default(caps);
++
+ if ( opt_xpti == 0 )
+ setup_force_cpu_cap(X86_FEATURE_NO_XPTI);
+ else
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0018-x86-spec-ctrl-Yet-more-fixes-for-xpti-parsing.patch b/emulators/xen-kernel411/files/0018-x86-spec-ctrl-Yet-more-fixes-for-xpti-parsing.patch
new file mode 100644
index 000000000000..dc42a9c44176
--- /dev/null
+++ b/emulators/xen-kernel411/files/0018-x86-spec-ctrl-Yet-more-fixes-for-xpti-parsing.patch
@@ -0,0 +1,89 @@
+From 6fe9726aebc11433083b9810402501f1a71d02fd Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Aug 2018 17:22:17 +0100
+Subject: [PATCH 18/42] x86/spec-ctrl: Yet more fixes for xpti= parsing
+
+As it currently stands, 'xpti=dom0' is indistinguishable from the default
+value, which means it will be overridden by ARCH_CAPABILITIES_RDCL_NO on fixed
+hardware.
+
+Switch opt_xpti to use -1 as a default like all our other related options, and
+clobber it as soon as we have a string to parse.
+
+In addition, 'xpti' alone should be interpreted in its positive boolean form,
+rather than resulting in a parse error.
+
+ (XEN) parameter "xpti" has invalid value "", rc=-22!
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 2a3b34ec47817048ab59586855cf0709fc77487e)
+---
+ xen/arch/x86/spec_ctrl.c | 15 +++++++++++----
+ xen/include/asm-x86/spec_ctrl.h | 2 +-
+ 2 files changed, 12 insertions(+), 5 deletions(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 32a4ea6e99..32213ace86 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -420,8 +420,7 @@ static bool __init should_use_eager_fpu(void)
+ }
+ }
+
+-#define OPT_XPTI_DEFAULT 0xff
+-uint8_t __read_mostly opt_xpti = OPT_XPTI_DEFAULT;
++int8_t __read_mostly opt_xpti = -1;
+
+ static __init void xpti_init_default(uint64_t caps)
+ {
+@@ -439,6 +438,14 @@ static __init int parse_xpti(const char *s)
+ const char *ss;
+ int val, rc = 0;
+
++ /* Inhibit the defaults as an explicit choice has been given. */
++ if ( opt_xpti == -1 )
++ opt_xpti = 0;
++
++ /* Interpret 'xpti' alone in its positive boolean form. */
++ if ( *s == '\0' )
++ opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
++
+ do {
+ ss = strchr(s, ',');
+ if ( !ss )
+@@ -456,7 +463,7 @@ static __init int parse_xpti(const char *s)
+
+ default:
+ if ( !strcmp(s, "default") )
+- opt_xpti = OPT_XPTI_DEFAULT;
++ opt_xpti = -1;
+ else if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
+ opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) |
+ (val ? OPT_XPTI_DOM0 : 0);
+@@ -618,7 +625,7 @@ void __init init_speculation_mitigations(void)
+ if ( default_xen_spec_ctrl )
+ setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE);
+
+- if ( opt_xpti == OPT_XPTI_DEFAULT )
++ if ( opt_xpti == -1 )
+ xpti_init_default(caps);
+
+ if ( opt_xpti == 0 )
+diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
+index 5b40afbab0..fea82603ca 100644
+--- a/xen/include/asm-x86/spec_ctrl.h
++++ b/xen/include/asm-x86/spec_ctrl.h
+@@ -34,7 +34,7 @@ extern bool bsp_delay_spec_ctrl;
+ extern uint8_t default_xen_spec_ctrl;
+ extern uint8_t default_spec_ctrl_flags;
+
+-extern uint8_t opt_xpti;
++extern int8_t opt_xpti;
+ #define OPT_XPTI_DOM0 0x01
+ #define OPT_XPTI_DOMU 0x02
+
+--
+2.18.0
+
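After the two parsing fixes above (this patch and the previous one), the `xpti=` option behaves as summarised below. This is an illustrative list derived only from the hunks shown here; other sub-options exist, and the authoritative reference remains xen-command-line.markdown.

    xpti               bare option: positive boolean, enable XPTI for dom0 and domU
    xpti=default       defer to Xen's hardware-dependent default (RDCL_NO aware)
    xpti=dom0          explicit choice: enable for dom0 only
    xpti=no-dom0       explicit choice: disable for dom0

Note that any explicit string first clears opt_xpti and then applies the listed sub-options, so parts not mentioned on the command line stay disabled.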
diff --git a/emulators/xen-kernel411/files/0019-x86-vmx-Fix-handing-of-MSR_DEBUGCTL-on-VMExit.patch b/emulators/xen-kernel411/files/0019-x86-vmx-Fix-handing-of-MSR_DEBUGCTL-on-VMExit.patch
new file mode 100644
index 000000000000..b8f97f138824
--- /dev/null
+++ b/emulators/xen-kernel411/files/0019-x86-vmx-Fix-handing-of-MSR_DEBUGCTL-on-VMExit.patch
@@ -0,0 +1,281 @@
+From 4254e9874006cc2641b67d0531a3a65374f34c35 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 24 May 2018 17:20:09 +0000
+Subject: [PATCH 19/42] x86/vmx: Fix handing of MSR_DEBUGCTL on VMExit
+
+Currently, whenever the guest writes a nonzero value to MSR_DEBUGCTL, Xen
+updates a host MSR load list entry with the current hardware value of
+MSR_DEBUGCTL.
+
+On VMExit, hardware automatically resets MSR_DEBUGCTL to 0. Later, when the
+guest writes to MSR_DEBUGCTL, the current value in hardware (0) is fed back
+into the guest load list. As a practical result, `ler` debugging gets lost on
+any PCPU which has ever scheduled an HVM vcpu, and in the common case, when
+`ler` debugging isn't active, guest actions result in an unnecessary load
+list entry repeating the MSR_DEBUGCTL reset.
+
+Restoration of Xen's debugging setting needs to happen from the very first
+vmexit. Due to the automatic reset, Xen need take no action in the general
+case, and only needs to load a value when debugging is active.
+
+This could be fixed by using a host MSR load list entry set up during
+construct_vmcs(). However, a more efficient option is to use an alternative
+block in the VMExit path, keyed on whether hypervisor debugging has been
+enabled.
+
+In order to set this up, drop the per cpu ler_msr variable (as there is no
+point having it per cpu when it will be the same everywhere), and use a single
+read_mostly variable instead. Split calc_ler_msr() out of percpu_traps_init()
+for clarity.
+
+Finally, clean up do_debug(). Reinstate LBR early to help catch cascade
+errors, which allows for the removal of the out label.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+(cherry picked from commit 730dc8d2c9e1b6402e66973cf99a7c56bc78be4c)
+---
+ xen/arch/x86/hvm/vmx/entry.S | 9 +++++
+ xen/arch/x86/hvm/vmx/vmx.c | 3 +-
+ xen/arch/x86/traps.c | 64 +++++++++++++++----------------
+ xen/arch/x86/x86_64/traps.c | 7 ++--
+ xen/include/asm-x86/cpufeature.h | 1 +
+ xen/include/asm-x86/cpufeatures.h | 1 +
+ xen/include/asm-x86/msr.h | 2 +-
+ 7 files changed, 47 insertions(+), 40 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
+index aa2f103895..afd552f2b9 100644
+--- a/xen/arch/x86/hvm/vmx/entry.S
++++ b/xen/arch/x86/hvm/vmx/entry.S
+@@ -41,6 +41,15 @@ ENTRY(vmx_asm_vmexit_handler)
+ SPEC_CTRL_ENTRY_FROM_HVM /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
+ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+
++ /* Hardware clears MSR_DEBUGCTL on VMExit. Reinstate it if debugging Xen. */
++ .macro restore_lbr
++ mov $IA32_DEBUGCTLMSR_LBR, %eax
++ mov $MSR_IA32_DEBUGCTLMSR, %ecx
++ xor %edx, %edx
++ wrmsr
++ .endm
++ ALTERNATIVE "", restore_lbr, X86_FEATURE_XEN_LBR
++
+ mov %rsp,%rdi
+ call vmx_vmexit_handler
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 7189820bfc..bb164359bb 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -3124,8 +3124,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ }
+ }
+
+- if ( (rc < 0) ||
+- (msr_content && (vmx_add_host_load_msr(msr) < 0)) )
++ if ( rc < 0 )
+ hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC);
+ else
+ __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
+diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
+index 9f045a2045..789d7ff8cd 100644
+--- a/xen/arch/x86/traps.c
++++ b/xen/arch/x86/traps.c
+@@ -96,8 +96,6 @@ string_param("nmi", opt_nmi);
+ DEFINE_PER_CPU(uint64_t, efer);
+ static DEFINE_PER_CPU(unsigned long, last_extable_addr);
+
+-DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
+-
+ DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table);
+ DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table);
+
+@@ -117,6 +115,9 @@ integer_param("debug_stack_lines", debug_stack_lines);
+ static bool opt_ler;
+ boolean_param("ler", opt_ler);
+
++/* LastExceptionFromIP on this hardware. Zero if LER is not in use. */
++unsigned int __read_mostly ler_msr;
++
+ #define stack_words_per_line 4
+ #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
+
+@@ -1778,17 +1779,6 @@ void do_device_not_available(struct cpu_user_regs *regs)
+ return;
+ }
+
+-static void ler_enable(void)
+-{
+- u64 debugctl;
+-
+- if ( !this_cpu(ler_msr) )
+- return;
+-
+- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR);
+-}
+-
+ void do_debug(struct cpu_user_regs *regs)
+ {
+ unsigned long dr6;
+@@ -1821,6 +1811,10 @@ void do_debug(struct cpu_user_regs *regs)
+ */
+ write_debugreg(6, X86_DR6_DEFAULT);
+
++ /* #DB automatically disabled LBR. Reinstate it if debugging Xen. */
++ if ( cpu_has_xen_lbr )
++ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
++
+ if ( !guest_mode(regs) )
+ {
+ /*
+@@ -1838,7 +1832,7 @@ void do_debug(struct cpu_user_regs *regs)
+ {
+ if ( regs->rip == (unsigned long)sysenter_eflags_saved )
+ regs->eflags &= ~X86_EFLAGS_TF;
+- goto out;
++ return;
+ }
+ if ( !debugger_trap_fatal(TRAP_debug, regs) )
+ {
+@@ -1895,20 +1889,14 @@ void do_debug(struct cpu_user_regs *regs)
+ regs->cs, _p(regs->rip), _p(regs->rip),
+ regs->ss, _p(regs->rsp), dr6);
+
+- goto out;
++ return;
+ }
+
+ /* Save debug status register where guest OS can peek at it */
+ v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);
+ v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);
+
+- ler_enable();
+ pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
+- return;
+-
+- out:
+- ler_enable();
+- return;
+ }
+
+ static void __init noinline __set_intr_gate(unsigned int n,
+@@ -1952,38 +1940,46 @@ void load_TR(void)
+ : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
+ }
+
+-void percpu_traps_init(void)
++static unsigned int calc_ler_msr(void)
+ {
+- subarch_percpu_traps_init();
+-
+- if ( !opt_ler )
+- return;
+-
+ switch ( boot_cpu_data.x86_vendor )
+ {
+ case X86_VENDOR_INTEL:
+ switch ( boot_cpu_data.x86 )
+ {
+ case 6:
+- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
+- break;
++ return MSR_IA32_LASTINTFROMIP;
++
+ case 15:
+- this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
+- break;
++ return MSR_P4_LER_FROM_LIP;
+ }
+ break;
++
+ case X86_VENDOR_AMD:
+ switch ( boot_cpu_data.x86 )
+ {
+ case 6:
+ case 0xf ... 0x17:
+- this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
+- break;
++ return MSR_IA32_LASTINTFROMIP;
+ }
+ break;
+ }
+
+- ler_enable();
++ return 0;
++}
++
++void percpu_traps_init(void)
++{
++ subarch_percpu_traps_init();
++
++ if ( !opt_ler )
++ return;
++
++ if ( !ler_msr && (ler_msr = calc_ler_msr()) )
++ setup_force_cpu_cap(X86_FEATURE_XEN_LBR);
++
++ if ( cpu_has_xen_lbr )
++ wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
+ }
+
+ void __init init_idt_traps(void)
+diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
+index f7f6928d70..b0401850ef 100644
+--- a/xen/arch/x86/x86_64/traps.c
++++ b/xen/arch/x86/x86_64/traps.c
+@@ -144,11 +144,12 @@ void show_registers(const struct cpu_user_regs *regs)
+ printk("CPU: %d\n", smp_processor_id());
+ _show_registers(&fault_regs, fault_crs, context, v);
+
+- if ( this_cpu(ler_msr) && !guest_mode(regs) )
++ if ( ler_msr && !guest_mode(regs) )
+ {
+ u64 from, to;
+- rdmsrl(this_cpu(ler_msr), from);
+- rdmsrl(this_cpu(ler_msr) + 1, to);
++
++ rdmsrl(ler_msr, from);
++ rdmsrl(ler_msr + 1, to);
+ printk("ler: %016lx -> %016lx\n", from, to);
+ }
+ }
+diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
+index 2cf8f7ea2a..b237da165c 100644
+--- a/xen/include/asm-x86/cpufeature.h
++++ b/xen/include/asm-x86/cpufeature.h
+@@ -113,6 +113,7 @@
+ #define cpu_has_aperfmperf boot_cpu_has(X86_FEATURE_APERFMPERF)
+ #define cpu_has_lfence_dispatch boot_cpu_has(X86_FEATURE_LFENCE_DISPATCH)
+ #define cpu_has_no_xpti boot_cpu_has(X86_FEATURE_NO_XPTI)
++#define cpu_has_xen_lbr boot_cpu_has(X86_FEATURE_XEN_LBR)
+
+ enum _cache_type {
+ CACHE_TYPE_NULL = 0,
+diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
+index b90aa2d046..8e5cc53dde 100644
+--- a/xen/include/asm-x86/cpufeatures.h
++++ b/xen/include/asm-x86/cpufeatures.h
+@@ -32,3 +32,4 @@ XEN_CPUFEATURE(SC_RSB_PV, (FSCAPINTS+0)*32+18) /* RSB overwrite needed for
+ XEN_CPUFEATURE(SC_RSB_HVM, (FSCAPINTS+0)*32+19) /* RSB overwrite needed for HVM */
+ XEN_CPUFEATURE(NO_XPTI, (FSCAPINTS+0)*32+20) /* XPTI mitigation not in use */
+ XEN_CPUFEATURE(SC_MSR_IDLE, (FSCAPINTS+0)*32+21) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */
++XEN_CPUFEATURE(XEN_LBR, (FSCAPINTS+0)*32+22) /* Xen uses MSR_DEBUGCTL.LBR */
+diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
+index f14f265aa5..afbeb7f155 100644
+--- a/xen/include/asm-x86/msr.h
++++ b/xen/include/asm-x86/msr.h
+@@ -241,7 +241,7 @@ static inline void write_efer(uint64_t val)
+ wrmsrl(MSR_EFER, val);
+ }
+
+-DECLARE_PER_CPU(u32, ler_msr);
++extern unsigned int ler_msr;
+
+ DECLARE_PER_CPU(uint32_t, tsc_aux);
+
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0020-x86-vmx-Defer-vmx_vmcs_exit-as-long-as-possible-in-c.patch b/emulators/xen-kernel411/files/0020-x86-vmx-Defer-vmx_vmcs_exit-as-long-as-possible-in-c.patch
new file mode 100644
index 000000000000..6ad003fdafaa
--- /dev/null
+++ b/emulators/xen-kernel411/files/0020-x86-vmx-Defer-vmx_vmcs_exit-as-long-as-possible-in-c.patch
@@ -0,0 +1,63 @@
+From 61cc8769a917c646b9bc99ee8adbea602f8d50d2 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 28 May 2018 15:02:34 +0100
+Subject: [PATCH 20/42] x86/vmx: Defer vmx_vmcs_exit() as long as possible in
+ construct_vmcs()
+
+paging_update_paging_modes() and vmx_vlapic_msr_changed() both operate on the
+VMCS being constructed. Avoid dropping and re-acquiring the reference
+multiple times.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+(cherry picked from commit f30e3cf34042846e391e3f8361fc6a76d181a7ee)
+---
+ xen/arch/x86/hvm/vmx/vmcs.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 258fc08f72..15d63663e5 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -996,6 +996,7 @@ static int construct_vmcs(struct vcpu *v)
+ struct domain *d = v->domain;
+ u32 vmexit_ctl = vmx_vmexit_control;
+ u32 vmentry_ctl = vmx_vmentry_control;
++ int rc = 0;
+
+ vmx_vmcs_enter(v);
+
+@@ -1083,8 +1084,8 @@ static int construct_vmcs(struct vcpu *v)
+
+ if ( msr_bitmap == NULL )
+ {
+- vmx_vmcs_exit(v);
+- return -ENOMEM;
++ rc = -ENOMEM;
++ goto out;
+ }
+
+ memset(msr_bitmap, ~0, PAGE_SIZE);
+@@ -1268,14 +1269,15 @@ static int construct_vmcs(struct vcpu *v)
+ if ( cpu_has_vmx_tsc_scaling )
+ __vmwrite(TSC_MULTIPLIER, d->arch.hvm_domain.tsc_scaling_ratio);
+
+- vmx_vmcs_exit(v);
+-
+ /* will update HOST & GUEST_CR3 as reqd */
+ paging_update_paging_modes(v);
+
+ vmx_vlapic_msr_changed(v);
+
+- return 0;
++ out:
++ vmx_vmcs_exit(v);
++
++ return rc;
+ }
+
+ static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0021-x86-vmx-API-improvements-for-MSR-load-save-infrastru.patch b/emulators/xen-kernel411/files/0021-x86-vmx-API-improvements-for-MSR-load-save-infrastru.patch
new file mode 100644
index 000000000000..d70df8740336
--- /dev/null
+++ b/emulators/xen-kernel411/files/0021-x86-vmx-API-improvements-for-MSR-load-save-infrastru.patch
@@ -0,0 +1,309 @@
+From 935e9c404714f5fa6d31890034a7e2cc11c6e0b9 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 7 May 2018 11:57:00 +0100
+Subject: [PATCH 21/42] x86/vmx: API improvements for MSR load/save
+ infrastructure
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Collect together related infrastructure in vmcs.h, rather than having it
+spread out. Turn vmx_{read,write}_guest_msr() into static inlines, as they
+are simple enough.
+
+Replace 'int type' with 'enum vmx_msr_list_type', and use switch statements
+internally. Later changes are going to introduce a new type.
+
+Rename the type identifiers for consistency with the other VMX_MSR_*
+constants.
+
+No functional change.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+(cherry picked from commit f54b63e8617ada823be43d60467a43c8224b7909)
+---
+ xen/arch/x86/hvm/vmx/vmcs.c | 93 +++++++++++++-----------------
+ xen/arch/x86/hvm/vmx/vmx.c | 8 +--
+ xen/include/asm-x86/hvm/vmx/vmcs.h | 62 +++++++++++++++-----
+ 3 files changed, 91 insertions(+), 72 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 15d63663e5..6bc6597242 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -1293,22 +1293,26 @@ static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
+ return 0;
+ }
+
+-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type)
++struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
+ {
+ struct vcpu *curr = current;
+ unsigned int msr_count;
+- struct vmx_msr_entry *msr_area;
++ struct vmx_msr_entry *msr_area = NULL;
+
+- if ( type == VMX_GUEST_MSR )
++ switch ( type )
+ {
+- msr_count = curr->arch.hvm_vmx.msr_count;
+- msr_area = curr->arch.hvm_vmx.msr_area;
+- }
+- else
+- {
+- ASSERT(type == VMX_HOST_MSR);
++ case VMX_MSR_HOST:
+ msr_count = curr->arch.hvm_vmx.host_msr_count;
+ msr_area = curr->arch.hvm_vmx.host_msr_area;
++ break;
++
++ case VMX_MSR_GUEST:
++ msr_count = curr->arch.hvm_vmx.msr_count;
++ msr_area = curr->arch.hvm_vmx.msr_area;
++ break;
++
++ default:
++ ASSERT_UNREACHABLE();
+ }
+
+ if ( msr_area == NULL )
+@@ -1318,48 +1322,27 @@ struct vmx_msr_entry *vmx_find_msr(u32 msr, int type)
+ vmx_msr_entry_key_cmp);
+ }
+
+-int vmx_read_guest_msr(u32 msr, u64 *val)
+-{
+- struct vmx_msr_entry *ent;
+-
+- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
+- {
+- *val = ent->data;
+- return 0;
+- }
+-
+- return -ESRCH;
+-}
+-
+-int vmx_write_guest_msr(u32 msr, u64 val)
+-{
+- struct vmx_msr_entry *ent;
+-
+- if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
+- {
+- ent->data = val;
+- return 0;
+- }
+-
+- return -ESRCH;
+-}
+-
+-int vmx_add_msr(u32 msr, int type)
++int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
+ {
+ struct vcpu *curr = current;
+ unsigned int idx, *msr_count;
+ struct vmx_msr_entry **msr_area, *msr_area_elem;
+
+- if ( type == VMX_GUEST_MSR )
++ switch ( type )
+ {
+- msr_count = &curr->arch.hvm_vmx.msr_count;
+- msr_area = &curr->arch.hvm_vmx.msr_area;
+- }
+- else
+- {
+- ASSERT(type == VMX_HOST_MSR);
++ case VMX_MSR_HOST:
+ msr_count = &curr->arch.hvm_vmx.host_msr_count;
+ msr_area = &curr->arch.hvm_vmx.host_msr_area;
++ break;
++
++ case VMX_MSR_GUEST:
++ msr_count = &curr->arch.hvm_vmx.msr_count;
++ msr_area = &curr->arch.hvm_vmx.msr_area;
++ break;
++
++ default:
++ ASSERT_UNREACHABLE();
++ return -EINVAL;
+ }
+
+ if ( *msr_area == NULL )
+@@ -1367,13 +1350,17 @@ int vmx_add_msr(u32 msr, int type)
+ if ( (*msr_area = alloc_xenheap_page()) == NULL )
+ return -ENOMEM;
+
+- if ( type == VMX_GUEST_MSR )
++ switch ( type )
+ {
++ case VMX_MSR_HOST:
++ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
++ break;
++
++ case VMX_MSR_GUEST:
+ __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
+ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
++ break;
+ }
+- else
+- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
+ }
+
+ for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ )
+@@ -1392,16 +1379,18 @@ int vmx_add_msr(u32 msr, int type)
+
+ ++*msr_count;
+
+- if ( type == VMX_GUEST_MSR )
++ switch ( type )
+ {
++ case VMX_MSR_HOST:
++ rdmsrl(msr, msr_area_elem->data);
++ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
++ break;
++
++ case VMX_MSR_GUEST:
+ msr_area_elem->data = 0;
+ __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
+ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
+- }
+- else
+- {
+- rdmsrl(msr, msr_area_elem->data);
+- __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
++ break;
+ }
+
+ return 0;
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index bb164359bb..d4ebae8945 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -4169,7 +4169,7 @@ static void lbr_tsx_fixup(void)
+ struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
+ struct vmx_msr_entry *msr;
+
+- if ( (msr = vmx_find_msr(lbr_from_start, VMX_GUEST_MSR)) != NULL )
++ if ( (msr = vmx_find_msr(lbr_from_start, VMX_MSR_GUEST)) != NULL )
+ {
+ /*
+ * Sign extend into bits 61:62 while preserving bit 63
+@@ -4179,7 +4179,7 @@ static void lbr_tsx_fixup(void)
+ msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
+ }
+
+- if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_GUEST_MSR)) != NULL )
++ if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_MSR_GUEST)) != NULL )
+ msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
+ }
+
+@@ -4207,8 +4207,8 @@ static void bdw_erratum_bdf14_fixup(void)
+ * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by
+ * sign-extending into bits 48:63.
+ */
+- sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_GUEST_MSR);
+- sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_GUEST_MSR);
++ sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST);
++ sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST);
+ }
+
+ static void lbr_fixup(void)
+diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
+index 06c3179cec..20882d13e0 100644
+--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
+@@ -514,9 +514,6 @@ enum vmcs_field {
+
+ #define VMCS_VPID_WIDTH 16
+
+-#define VMX_GUEST_MSR 0
+-#define VMX_HOST_MSR 1
+-
+ /* VM Instruction error numbers */
+ enum vmx_insn_errno
+ {
+@@ -534,6 +531,52 @@ enum vmx_insn_errno
+ VMX_INSN_FAIL_INVALID = ~0,
+ };
+
++/* MSR load/save list infrastructure. */
++enum vmx_msr_list_type {
++ VMX_MSR_HOST, /* MSRs loaded on VMExit. */
++ VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */
++};
++
++int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type);
++
++static inline int vmx_add_host_load_msr(uint32_t msr)
++{
++ return vmx_add_msr(msr, VMX_MSR_HOST);
++}
++
++static inline int vmx_add_guest_msr(uint32_t msr)
++{
++ return vmx_add_msr(msr, VMX_MSR_GUEST);
++}
++
++struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type);
++
++static inline int vmx_read_guest_msr(uint32_t msr, uint64_t *val)
++{
++ const struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST);
++
++ if ( !ent )
++ return -ESRCH;
++
++ *val = ent->data;
++
++ return 0;
++}
++
++static inline int vmx_write_guest_msr(uint32_t msr, uint64_t val)
++{
++ struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST);
++
++ if ( !ent )
++ return -ESRCH;
++
++ ent->data = val;
++
++ return 0;
++}
++
++
++/* MSR intercept bitmap infrastructure. */
+ enum vmx_msr_intercept_type {
+ VMX_MSR_R = 1,
+ VMX_MSR_W = 2,
+@@ -544,10 +587,6 @@ void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr,
+ enum vmx_msr_intercept_type type);
+ void vmx_set_msr_intercept(struct vcpu *v, unsigned int msr,
+ enum vmx_msr_intercept_type type);
+-int vmx_read_guest_msr(u32 msr, u64 *val);
+-int vmx_write_guest_msr(u32 msr, u64 val);
+-struct vmx_msr_entry *vmx_find_msr(u32 msr, int type);
+-int vmx_add_msr(u32 msr, int type);
+ void vmx_vmcs_switch(paddr_t from, paddr_t to);
+ void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector);
+ void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector);
+@@ -562,15 +601,6 @@ void virtual_vmcs_vmwrite(const struct vcpu *, u32 encoding, u64 val);
+ enum vmx_insn_errno virtual_vmcs_vmwrite_safe(const struct vcpu *v,
+ u32 vmcs_encoding, u64 val);
+
+-static inline int vmx_add_guest_msr(u32 msr)
+-{
+- return vmx_add_msr(msr, VMX_GUEST_MSR);
+-}
+-static inline int vmx_add_host_load_msr(u32 msr)
+-{
+- return vmx_add_msr(msr, VMX_HOST_MSR);
+-}
+-
+ DECLARE_PER_CPU(bool_t, vmxon);
+
+ bool_t vmx_vcpu_pml_enabled(const struct vcpu *v);
+--
+2.18.0
+
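The header changes above make vmx_{read,write}_guest_msr() thin wrappers around vmx_find_msr(), with vmx_add_msr() creating entries on demand. The fragment below is a hypothetical in-hypervisor caller showing the intended calling pattern and error codes; it uses the signatures as they stand after this patch (a later patch in the series adds a struct vcpu parameter), assumes it runs in the context of the vcpu being modified, and is a sketch rather than code from the patch.

    /* Hypothetical caller inside Xen; demo_track_msr() is not a real function. */
    #include <xen/errno.h>
    #include <asm/hvm/vmx/vmcs.h>

    static int demo_track_msr(uint32_t msr, uint64_t initial_val)
    {
        /* Saved on VMExit, loaded on VMEntry; no-op if already present. */
        int rc = vmx_add_guest_msr(msr);

        if ( rc )            /* -ENOMEM, -ENOSPC (the list is one page), ... */
            return rc;

        /* The entry exists now, so -ESRCH is not expected from the write. */
        return vmx_write_guest_msr(msr, initial_val);
    }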
diff --git a/emulators/xen-kernel411/files/0022-x86-vmx-Internal-cleanup-for-MSR-load-save-infrastru.patch b/emulators/xen-kernel411/files/0022-x86-vmx-Internal-cleanup-for-MSR-load-save-infrastru.patch
new file mode 100644
index 000000000000..9c689187b018
--- /dev/null
+++ b/emulators/xen-kernel411/files/0022-x86-vmx-Internal-cleanup-for-MSR-load-save-infrastru.patch
@@ -0,0 +1,171 @@
+From 52b8f9ae22a5daa1f2cad0aa5065b72b48c33ce4 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 7 May 2018 11:57:00 +0100
+Subject: [PATCH 22/42] x86/vmx: Internal cleanup for MSR load/save
+ infrastructure
+
+ * Use an arch_vmx_struct local variable to reduce later code volume.
+ * Use start/total instead of msr_area/msr_count. This is in preparation for
+   more fine-grained handling with later changes.
+ * Use ent/end pointers (again for preparation), and to make the vmx_add_msr()
+ logic easier to follow.
+ * Make the memory allocation block of vmx_add_msr() unlikely, and calculate
+ virt_to_maddr() just once.
+
+No practical change to functionality.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+(cherry picked from commit 94fda356fcdcc847662a4c9f6cc63511f25c1247)
+---
+ xen/arch/x86/hvm/vmx/vmcs.c | 75 ++++++++++++++++++++-----------------
+ 1 file changed, 40 insertions(+), 35 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 6bc6597242..a6ddba3132 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -1296,48 +1296,49 @@ static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
+ struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
+ {
+ struct vcpu *curr = current;
+- unsigned int msr_count;
+- struct vmx_msr_entry *msr_area = NULL;
++ struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
++ struct vmx_msr_entry *start = NULL;
++ unsigned int total;
+
+ switch ( type )
+ {
+ case VMX_MSR_HOST:
+- msr_count = curr->arch.hvm_vmx.host_msr_count;
+- msr_area = curr->arch.hvm_vmx.host_msr_area;
++ start = vmx->host_msr_area;
++ total = vmx->host_msr_count;
+ break;
+
+ case VMX_MSR_GUEST:
+- msr_count = curr->arch.hvm_vmx.msr_count;
+- msr_area = curr->arch.hvm_vmx.msr_area;
++ start = vmx->msr_area;
++ total = vmx->msr_count;
+ break;
+
+ default:
+ ASSERT_UNREACHABLE();
+ }
+
+- if ( msr_area == NULL )
++ if ( !start )
+ return NULL;
+
+- return bsearch(&msr, msr_area, msr_count, sizeof(struct vmx_msr_entry),
+- vmx_msr_entry_key_cmp);
++ return bsearch(&msr, start, total, sizeof(*start), vmx_msr_entry_key_cmp);
+ }
+
+ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
+ {
+ struct vcpu *curr = current;
+- unsigned int idx, *msr_count;
+- struct vmx_msr_entry **msr_area, *msr_area_elem;
++ struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
++ struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
++ unsigned int total;
+
+ switch ( type )
+ {
+ case VMX_MSR_HOST:
+- msr_count = &curr->arch.hvm_vmx.host_msr_count;
+- msr_area = &curr->arch.hvm_vmx.host_msr_area;
++ ptr = &vmx->host_msr_area;
++ total = vmx->host_msr_count;
+ break;
+
+ case VMX_MSR_GUEST:
+- msr_count = &curr->arch.hvm_vmx.msr_count;
+- msr_area = &curr->arch.hvm_vmx.msr_area;
++ ptr = &vmx->msr_area;
++ total = vmx->msr_count;
+ break;
+
+ default:
+@@ -1345,51 +1346,55 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
+ return -EINVAL;
+ }
+
+- if ( *msr_area == NULL )
++ /* Allocate memory on first use. */
++ if ( unlikely(!*ptr) )
+ {
+- if ( (*msr_area = alloc_xenheap_page()) == NULL )
++ paddr_t addr;
++
++ if ( (*ptr = alloc_xenheap_page()) == NULL )
+ return -ENOMEM;
+
++ addr = virt_to_maddr(*ptr);
++
+ switch ( type )
+ {
+ case VMX_MSR_HOST:
+- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
++ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, addr);
+ break;
+
+ case VMX_MSR_GUEST:
+- __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
+- __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
++ __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr);
++ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr);
+ break;
+ }
+ }
+
+- for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ )
+- if ( (*msr_area)[idx].index == msr )
++ start = *ptr;
++ end = start + total;
++
++ for ( ent = start; ent < end && ent->index <= msr; ++ent )
++ if ( ent->index == msr )
+ return 0;
+
+- if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
++ if ( total == (PAGE_SIZE / sizeof(*ent)) )
+ return -ENOSPC;
+
+- memmove(*msr_area + idx + 1, *msr_area + idx,
+- sizeof(*msr_area_elem) * (*msr_count - idx));
+-
+- msr_area_elem = *msr_area + idx;
+- msr_area_elem->index = msr;
+- msr_area_elem->mbz = 0;
++ memmove(ent + 1, ent, sizeof(*ent) * (end - ent));
+
+- ++*msr_count;
++ ent->index = msr;
++ ent->mbz = 0;
+
+ switch ( type )
+ {
+ case VMX_MSR_HOST:
+- rdmsrl(msr, msr_area_elem->data);
+- __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
++ rdmsrl(msr, ent->data);
++ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count);
+ break;
+
+ case VMX_MSR_GUEST:
+- msr_area_elem->data = 0;
+- __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
+- __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
++ ent->data = 0;
++ __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_count);
++ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_count);
+ break;
+ }
+
+--
+2.18.0
+
diff --git a/emulators/xen-kernel411/files/0023-x86-vmx-Factor-locate_msr_entry-out-of-vmx_find_msr-.patch b/emulators/xen-kernel411/files/0023-x86-vmx-Factor-locate_msr_entry-out-of-vmx_find_msr-.patch
new file mode 100644
index 000000000000..8ebb71c2f943
--- /dev/null
+++ b/emulators/xen-kernel411/files/0023-x86-vmx-Factor-locate_msr_entry-out-of-vmx_find_msr-.patch
@@ -0,0 +1,104 @@
+From b52017c904ae770ab86a62bf3219ee21d23bb55b Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 7 May 2018 11:57:00 +0100
+Subject: [PATCH 23/42] x86/vmx: Factor locate_msr_entry() out of
+ vmx_find_msr() and vmx_add_msr()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Instead of having multiple algorithms searching the MSR lists, implement a
+single one. It has the semantics required by vmx_add_msr(), to identify the
+position in which an MSR should live, if it isn't already present.
+
+There will be a marginal improvement for vmx_find_msr() by avoiding the
+function pointer calls to vmx_msr_entry_key_cmp(), and a major improvement for
+vmx_add_msr() by using a binary search instead of a linear search.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+(cherry picked from commit 4d94828cf11104256dccea1fa7762f00575dfaa0)
+---
+ xen/arch/x86/hvm/vmx/vmcs.c | 41 +++++++++++++++++++++++++------------
+ 1 file changed, 28 insertions(+), 13 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index a6ddba3132..c75b0ee5c3 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -1280,24 +1280,36 @@ static int construct_vmcs(struct vcpu *v)
+ return rc;
+ }
+
+-static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
++/*
++ * Search an MSR list looking for an MSR entry, or the slot in which it should
++ * live (to keep the data sorted) if an entry is not found.
++ *
++ * The return pointer is guaranteed to be bounded by start and end. However,
++ * it may point at end, and may be invalid for the caller to dereference.
++ */
++static struct vmx_msr_entry *locate_msr_entry(
++ struct vmx_msr_entry *start, struct vmx_msr_entry *end, uint32_t msr)
+ {
+- const u32 *msr = key;
+- const struct vmx_msr_entry *entry = elt;
++ while ( start < end )
++ {
++ struct vmx_msr_entry *mid = start + (end - start) / 2;
+
+- if ( *msr > entry->index )
+- return 1;
+- if ( *msr < entry->index )
+- return -1;
++ if ( msr < mid->index )
++ end = mid;
++ else if ( msr > mid->index )
++ start = mid + 1;
++ else
++ return mid;
++ }
+
+- return 0;
++ return start;
+ }
+
+ struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
+ {
+ struct vcpu *curr = current;
+ struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
+- struct vmx_msr_entry *start = NULL;
++ struct vmx_msr_entry *start = NULL, *ent, *end;
+ unsigned int total;
+
+ switch ( type )
+@@ -1319,7 +1331,10 @@ struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
+ if ( !start )
+ return NULL;
+
+- return bsearch(&msr, start, total, sizeof(*start), vmx_msr_entry_key_cmp);
++ end = start + total;
++ ent = locate_msr_entry(start, end, msr);
++
++ return ((ent < end) && (ent->index == msr)) ? ent : NULL;
+ }
+
+ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
+@@ -1371,10 +1386,10 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
+
+ start = *ptr;
+ end = start + total;
++ ent = locate_msr_entry(start, end, msr);
+
+- for ( ent = start; ent < end && ent->index <= msr; ++ent )
+- if ( ent->index == msr )
+- return 0;
++ if ( (ent < end) && (ent->index == msr) )
++ return 0;
+
+ if ( total == (PAGE_SIZE / sizeof(*ent)) )
+ return -ENOSPC;
+--
+2.18.0
+
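locate_msr_entry() above is a lower-bound style binary search: it returns either the matching entry or the slot at which a new entry must be inserted to keep the list sorted (possibly the end pointer). The stand-alone C program below reproduces the same search pattern over a plain array of MSR-style indices as a simplified illustration of those semantics; the values are arbitrary examples, not taken from the patch.

    #include <stdio.h>
    #include <stdint.h>

    /* Return the first slot whose value is >= key; may equal 'end'. */
    static const uint32_t *locate(const uint32_t *start, const uint32_t *end,
                                  uint32_t key)
    {
        while ( start < end )
        {
            const uint32_t *mid = start + (end - start) / 2;

            if ( key < *mid )
                end = mid;
            else if ( key > *mid )
                start = mid + 1;
            else
                return mid;
        }

        return start;
    }

    int main(void)
    {
        static const uint32_t list[] = { 0x10, 0x1d9, 0x277, 0xc0000080 };
        const uint32_t *end = list + sizeof(list) / sizeof(list[0]);
        const uint32_t *hit = locate(list, end, 0x1d9);
        const uint32_t *slot = locate(list, end, 0x200);

        printf("0x1d9: %s\n", (hit < end && *hit == 0x1d9) ? "found" : "absent");
        printf("0x200: insert at index %td\n", slot - list);
        return 0;
    }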
diff --git a/emulators/xen-kernel411/files/0024-x86-vmx-Support-remote-access-to-the-MSR-lists.patch b/emulators/xen-kernel411/files/0024-x86-vmx-Support-remote-access-to-the-MSR-lists.patch
new file mode 100644
index 000000000000..675fc57cc5e7
--- /dev/null
+++ b/emulators/xen-kernel411/files/0024-x86-vmx-Support-remote-access-to-the-MSR-lists.patch
@@ -0,0 +1,354 @@
+From 218d403ad944f47548752d4a60e8f77e5f8e1950 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 7 May 2018 11:57:00 +0100
+Subject: [PATCH 24/42] x86/vmx: Support remote access to the MSR lists
+
+At the moment, all modifications of the MSR lists are in current context.
+However, future changes may need to put MSR_EFER into the lists from domctl
+hypercall context.
+
+Plumb a struct vcpu parameter down through the infrastructure, and use
+vmx_vmcs_{enter,exit}() for safe access to the VMCS in vmx_add_msr(). Use
+assertions to ensure that access is either in current context, or while the
+vcpu is paused.
+
+Note these expectations beside the fields in arch_vmx_struct, and reorder the
+fields to avoid unnecessary padding.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 80599f0b770199116aa753bfdfac9bfe2e8ea86a)
+---
+ xen/arch/x86/cpu/vpmu_intel.c | 14 +++++------
+ xen/arch/x86/hvm/vmx/vmcs.c | 40 ++++++++++++++++++++++--------
+ xen/arch/x86/hvm/vmx/vmx.c | 22 ++++++++--------
+ xen/include/asm-x86/hvm/vmx/vmcs.h | 34 ++++++++++++++++---------
+ xen/include/xen/sched.h | 2 +-
+ 5 files changed, 72 insertions(+), 40 deletions(-)
+
+diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c
+index 207e2e712c..c499e69f2f 100644
+--- a/xen/arch/x86/cpu/vpmu_intel.c
++++ b/xen/arch/x86/cpu/vpmu_intel.c
+@@ -455,12 +455,12 @@ static int core2_vpmu_alloc_resource(struct vcpu *v)
+ if ( is_hvm_vcpu(v) )
+ {
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+- if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
++ if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
+ goto out_err;
+
+- if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
++ if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
+ goto out_err;
+- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
++ vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
+
+ core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) +
+@@ -613,7 +613,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
+ return -EINVAL;
+
+ if ( is_hvm_vcpu(v) )
+- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
+ &core2_vpmu_cxt->global_ctrl);
+ else
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
+@@ -682,7 +682,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
+ return -EINVAL;
+
+ if ( is_hvm_vcpu(v) )
+- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL,
+ &core2_vpmu_cxt->global_ctrl);
+ else
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
+@@ -701,7 +701,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
+ else
+ {
+ if ( is_hvm_vcpu(v) )
+- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
++ vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ else
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ }
+@@ -735,7 +735,7 @@ static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if ( is_hvm_vcpu(v) )
+- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
++ vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ else
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, *msr_content);
+ break;
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index c75b0ee5c3..e86f292fbc 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -1305,13 +1305,15 @@ static struct vmx_msr_entry *locate_msr_entry(
+ return start;
+ }
+
+-struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
++struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
++ enum vmx_msr_list_type type)
+ {
+- struct vcpu *curr = current;
+- struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
++ const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
+ struct vmx_msr_entry *start = NULL, *ent, *end;
+ unsigned int total;
+
++ ASSERT(v == current || !vcpu_runnable(v));
++
+ switch ( type )
+ {
+ case VMX_MSR_HOST:
+@@ -1337,12 +1339,14 @@ struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type)
+ return ((ent < end) && (ent->index == msr)) ? ent : NULL;
+ }
+
+-int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
++int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type)
+ {
+- struct vcpu *curr = current;
+- struct arch_vmx_struct *vmx = &curr->arch.hvm_vmx;
++ struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
+ struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
+ unsigned int total;
++ int rc;
++
++ ASSERT(v == current || !vcpu_runnable(v));
+
+ switch ( type )
+ {
+@@ -1361,13 +1365,18 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
+ return -EINVAL;
+ }
+
++ vmx_vmcs_enter(v);
++
+ /* Allocate memory on first use. */
+ if ( unlikely(!*ptr) )
+ {
+ paddr_t addr;
+
+ if ( (*ptr = alloc_xenheap_page()) == NULL )
+- return -ENOMEM;
++ {
++ rc = -ENOMEM;
++ goto out;
++ }
+
+ addr = virt_to_maddr(*ptr);
+
+@@ -1389,10 +1398,16 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
+ ent = locate_msr_entry(start, end, msr);
+
+ if ( (ent < end) && (ent->index == msr) )
+- return 0;
++ {
++ rc = 0;
++ goto out;
++ }
+
+ if ( total == (PAGE_SIZE / sizeof(*ent)) )
+- return -ENOSPC;
++ {
++ rc = -ENOSPC;
++ goto out;
++ }
+
+ memmove(ent + 1, ent, sizeof(*ent) * (end - ent));
+
+@@ -1413,7 +1428,12 @@ int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type)
+ break;
+ }
+
+- return 0;
++ rc = 0;
++
++ out:
++ vmx_vmcs_exit(v);
++
++ return rc;
+ }
+
+ void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector)
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index d4ebae8945..95162bf187 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -2822,7 +2822,7 @@ static int is_last_branch_msr(u32 ecx)
+
+ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
+ {
+- const struct vcpu *curr = current;
++ struct vcpu *curr = current;
+
+ HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x", msr);
+
+@@ -2901,7 +2901,7 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
+ if ( passive_domain_do_rdmsr(msr, msr_content) )
+ goto done;
+
+- if ( vmx_read_guest_msr(msr, msr_content) == 0 )
++ if ( vmx_read_guest_msr(curr, msr, msr_content) == 0 )
+ break;
+
+ if ( is_last_branch_msr(msr) )
+@@ -3113,7 +3113,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+
+ for ( ; (rc == 0) && lbr->count; lbr++ )
+ for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
+- if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
++ if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
+ {
+ vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
+ if ( lbr_tsx_fixup_needed )
+@@ -3153,7 +3153,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ if ( wrmsr_viridian_regs(msr, msr_content) )
+ break;
+
+- if ( vmx_write_guest_msr(msr, msr_content) == 0 ||
++ if ( vmx_write_guest_msr(v, msr, msr_content) == 0 ||
+ is_last_branch_msr(msr) )
+ break;
+
+@@ -4169,7 +4169,7 @@ static void lbr_tsx_fixup(void)
+ struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
+ struct vmx_msr_entry *msr;
+
+- if ( (msr = vmx_find_msr(lbr_from_start, VMX_MSR_GUEST)) != NULL )
++ if ( (msr = vmx_find_msr(curr, lbr_from_start, VMX_MSR_GUEST)) != NULL )
+ {
+ /*
+ * Sign extend into bits 61:62 while preserving bit 63
+@@ -4179,15 +4179,15 @@ static void lbr_tsx_fixup(void)
+ msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
+ }
+
+- if ( (msr = vmx_find_msr(lbr_lastint_from, VMX_MSR_GUEST)) != NULL )
++ if ( (msr = vmx_find_msr(curr, lbr_lastint_from, VMX_MSR_GUEST)) != NULL )
+ msr->data |= ((LBR_FROM_SIGNEXT_2MSB & msr->data) << 2);
+ }
+
+-static void sign_extend_msr(u32 msr, int type)
++static void sign_extend_msr(struct vcpu *v, u32 msr, int type)
+ {
+ struct vmx_msr_entry *entry;
+
+- if ( (entry = vmx_find_msr(msr, type)) != NULL )
++ if ( (entry = vmx_find_msr(v, msr, type)) != NULL )
+ {
+ if ( entry->data & VADDR_TOP_BIT )
+ entry->data |= CANONICAL_MASK;
+@@ -4198,6 +4198,8 @@ static void sign_extend_msr(u32 msr, int type)
+
+ static void bdw_erratum_bdf14_fixup(void)
+ {
++ struct vcpu *curr = current;
++
+ /*
+ * Occasionally, on certain Broadwell CPUs MSR_IA32_LASTINTTOIP has
+ * been observed to have the top three bits corrupted as though the
+@@ -4207,8 +4209,8 @@ static void bdw_erratum_bdf14_fixup(void)
+ * erratum BDF14. Fix up MSR_IA32_LASTINT{FROM,TO}IP by
+ * sign-extending into bits 48:63.
+ */
+- sign_extend_msr(MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST);
+- sign_extend_msr(MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST);
++ sign_extend_msr(curr, MSR_IA32_LASTINTFROMIP, VMX_MSR_GUEST);
++ sign_extend_msr(curr, MSR_IA32_LASTINTTOIP, VMX_MSR_GUEST);
+ }
+
+ static void lbr_fixup(void)
+diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
+index 20882d13e0..62afebec11 100644
+--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
+@@ -130,10 +130,17 @@ struct arch_vmx_struct {
+ uint64_t sfmask;
+
+ struct vmx_msr_bitmap *msr_bitmap;
+- unsigned int msr_count;
++
++ /*
++ * Most accesses to the MSR host/guest load/save lists are in current
++ * context. However, the data can be modified by toolstack/migration
++ * actions. Remote access is only permitted for paused vcpus, and is
++ * protected under the domctl lock.
++ */
+ struct vmx_msr_entry *msr_area;
+- unsigned int host_msr_count;
+ struct vmx_msr_entry *host_msr_area;
++ unsigned int msr_count;
++ unsigned int host_msr_count;
+
+ unsigned long eoi_exitmap_changed;
+ DECLARE_BITMAP(eoi_exit_bitmap, NR_VECTORS);
+@@ -537,23 +544,25 @@ enum vmx_msr_list_type {
+ VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */
+ };
+
+-int vmx_add_msr(uint32_t msr, enum vmx_msr_list_type type);
++int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type);
+
+-static inline int vmx_add_host_load_msr(uint32_t msr)
++static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr)
+ {
+- return vmx_add_msr(msr, VMX_MSR_HOST);
++ return vmx_add_msr(v, msr, VMX_MSR_GUEST);
+ }
+
+-static inline int vmx_add_guest_msr(uint32_t msr)
++static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr)
+ {
+- return vmx_add_msr(msr, VMX_MSR_GUEST);
++ return vmx_add_msr(v, msr, VMX_MSR_HOST);
+ }
+
+-struct vmx_msr_entry *vmx_find_msr(uint32_t msr, enum vmx_msr_list_type type);
++struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
++ enum vmx_msr_list_type type);
+
+-static inline int vmx_read_guest_msr(uint32_t msr, uint64_t *val)
++static inline int vmx_read_guest_msr(const struct vcpu *v, uint32_t msr,
++ uint64_t *val)
+ {
+- const struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST);
++ const struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
+
+ if ( !ent )
+ return -ESRCH;
+@@ -563,9 +572,10 @@ static inline int vmx_read_guest_msr(uint32_t msr, uint64_t *val)
+ return 0;
+ }
+
+-static inline int vmx_write_guest_msr(uint32_t msr, uint64_t val)
++static inline int vmx_write_guest_msr(struct vcpu *v, uint32_t msr,
++ uint64_t val)
+ {
+- struct vmx_msr_entry *ent = vmx_find_msr(msr, VMX_MSR_GUEST);
++ struct vmx_msr_entry *ent = vmx_find_msr(v, msr, VMX_MSR_GUEST);
+
+ if ( !ent )
+ return -ESRCH;
+diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
+index 99d2af2e1f..e79d5a36ca 100644
+--- a/xen/include/xen/sched.h
++++ b/xen/include/xen/sched.h
+@@ -788,7 +788,7 @@ static inline struct domain *next_domain_in_cpupool(
+ #define _VPF_parked 8
+ #define VPF_parked (1UL<<_VPF_parked)
+
+-static inline int vcpu_runnable(struct vcpu *v)
++static inline bool vcpu_runnable(const struct vcpu *v)
+ {
+ return !(v->pause_flags |
+ atomic_read(&v->pause_count) |
+--
+2.18.0
+
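The comment added to struct arch_vmx_struct above captures the new access rule: the MSR lists may only be touched from current context, or for a paused vcpu under the domctl lock. A minimal sketch of how a remote (toolstack-path) caller would honour that contract, using only the signatures introduced by this patch; example_read_remote_msr is an illustrative name, not something the patch adds:

    /* Sketch only: read a guest MSR list entry for a vcpu other than current. */
    static int example_read_remote_msr(struct vcpu *v, uint32_t msr, uint64_t *val)
    {
        int rc;

        vcpu_pause(v);                        /* satisfies !vcpu_runnable(v) */
        rc = vmx_read_guest_msr(v, msr, val); /* -ESRCH if msr has no list entry */
        vcpu_unpause(v);

        return rc;
    }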
diff --git a/emulators/xen-kernel411/files/0025-x86-vmx-Improvements-to-LBR-MSR-handling.patch b/emulators/xen-kernel411/files/0025-x86-vmx-Improvements-to-LBR-MSR-handling.patch
new file mode 100644
index 000000000000..a6b525801ea8
--- /dev/null
+++ b/emulators/xen-kernel411/files/0025-x86-vmx-Improvements-to-LBR-MSR-handling.patch
@@ -0,0 +1,176 @@
+From cfdd4e846a77ca5510b6c35adeec55014a73efb9 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 7 May 2018 11:57:00 +0100
+Subject: [PATCH 25/42] x86/vmx: Improvements to LBR MSR handling
+
+The main purpose of this patch is to only ever insert the LBR MSRs into the
+guest load/save list once, as a future patch wants to change the behaviour of
+vmx_add_guest_msr().
+
+The repeated processing of lbr_info and the guest's MSR load/save list is
+redundant, and a guest using LBR itself will have to re-enable
+MSR_DEBUGCTL.LBR in its #DB handler, meaning that Xen will repeat this
+redundant processing every time the guest gets a debug exception.
+
+Rename lbr_fixup_enabled to lbr_flags to be a little more generic, and use one
+bit to indicate that the MSRs have been inserted into the load/save list.
+Shorten the existing FIXUP* identifiers to reduce code volume.
+
+Furthermore, handing the guest #MC on an error isn't a legitimate action. Two
+of the three failure cases are definitely hypervisor bugs, and the third is a
+boundary case which shouldn't occur in practice. The guest also won't execute
+correctly, so handle errors by cleanly crashing the guest.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit be73a842e642772d7372004c9c105de35b771020)
+---
+ xen/arch/x86/hvm/vmx/vmx.c | 81 +++++++++++++++++++++---------
+ xen/include/asm-x86/hvm/vmx/vmcs.h | 2 +-
+ 2 files changed, 59 insertions(+), 24 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 95162bf187..5f01652d48 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -2758,8 +2758,10 @@ enum
+
+ #define LBR_FROM_SIGNEXT_2MSB ((1ULL << 59) | (1ULL << 60))
+
+-#define FIXUP_LBR_TSX (1u << 0)
+-#define FIXUP_BDW_ERRATUM_BDF14 (1u << 1)
++#define LBR_MSRS_INSERTED (1u << 0)
++#define LBR_FIXUP_TSX (1u << 1)
++#define LBR_FIXUP_BDF14 (1u << 2)
++#define LBR_FIXUP_MASK (LBR_FIXUP_TSX | LBR_FIXUP_BDF14)
+
+ static bool __read_mostly lbr_tsx_fixup_needed;
+ static bool __read_mostly bdw_erratum_bdf14_fixup_needed;
+@@ -3094,7 +3096,6 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ break;
+
+ case MSR_IA32_DEBUGCTLMSR: {
+- int i, rc = 0;
+ uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF;
+
+ if ( boot_cpu_has(X86_FEATURE_RTM) )
+@@ -3105,30 +3106,64 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ if ( vpmu_do_wrmsr(msr, msr_content, supported) )
+ break;
+ }
+- if ( msr_content & IA32_DEBUGCTLMSR_LBR )
++
++ /*
++ * When a guest first enables LBR, arrange to save and restore the LBR
++ * MSRs and allow the guest direct access.
++ *
++ * MSR_DEBUGCTL and LBR has existed almost as long as MSRs have
++ * existed, and there is no architectural way to hide the feature, or
++ * fail the attempt to enable LBR.
++ *
++ * Unknown host LBR MSRs or hitting -ENOSPC with the guest load/save
++ * list are definitely hypervisor bugs, whereas -ENOMEM for allocating
++ * the load/save list is simply unlucky (and shouldn't occur with
++ * sensible management by the toolstack).
++ *
++ * Either way, there is nothing we can do right now to recover, and
++ * the guest won't execute correctly either. Simply crash the domain
++ * to make the failure obvious.
++ */
++ if ( !(v->arch.hvm_vmx.lbr_flags & LBR_MSRS_INSERTED) &&
++ (msr_content & IA32_DEBUGCTLMSR_LBR) )
+ {
+ const struct lbr_info *lbr = last_branch_msr_get();
+- if ( lbr == NULL )
+- break;
+
+- for ( ; (rc == 0) && lbr->count; lbr++ )
+- for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
+- if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
++ if ( unlikely(!lbr) )
++ {
++ gprintk(XENLOG_ERR, "Unknown Host LBR MSRs\n");
++ domain_crash(v->domain);
++ return X86EMUL_OKAY;
++ }
++
++ for ( ; lbr->count; lbr++ )
++ {
++ unsigned int i;
++
++ for ( i = 0; i < lbr->count; i++ )
++ {
++ int rc = vmx_add_guest_msr(v, lbr->base + i);
++
++ if ( unlikely(rc) )
+ {
+- vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
+- if ( lbr_tsx_fixup_needed )
+- v->arch.hvm_vmx.lbr_fixup_enabled |= FIXUP_LBR_TSX;
+- if ( bdw_erratum_bdf14_fixup_needed )
+- v->arch.hvm_vmx.lbr_fixup_enabled |=
+- FIXUP_BDW_ERRATUM_BDF14;
++ gprintk(XENLOG_ERR,
++ "Guest load/save list error %d\n", rc);
++ domain_crash(v->domain);
++ return X86EMUL_OKAY;
+ }
+- }
+
+- if ( rc < 0 )
+- hvm_inject_hw_exception(TRAP_machine_check, X86_EVENT_NO_EC);
+- else
+- __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
++ vmx_clear_msr_intercept(v, lbr->base + i, VMX_MSR_RW);
++ }
++ }
++
++ v->arch.hvm_vmx.lbr_flags |= LBR_MSRS_INSERTED;
++ if ( lbr_tsx_fixup_needed )
++ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_TSX;
++ if ( bdw_erratum_bdf14_fixup_needed )
++ v->arch.hvm_vmx.lbr_flags |= LBR_FIXUP_BDF14;
++ }
+
++ __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
+ break;
+ }
+ case MSR_IA32_FEATURE_CONTROL:
+@@ -4217,9 +4252,9 @@ static void lbr_fixup(void)
+ {
+ struct vcpu *curr = current;
+
+- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_LBR_TSX )
++ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_TSX )
+ lbr_tsx_fixup();
+- if ( curr->arch.hvm_vmx.lbr_fixup_enabled & FIXUP_BDW_ERRATUM_BDF14 )
++ if ( curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_BDF14 )
+ bdw_erratum_bdf14_fixup();
+ }
+
+@@ -4287,7 +4322,7 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
+ }
+
+ out:
+- if ( unlikely(curr->arch.hvm_vmx.lbr_fixup_enabled) )
++ if ( unlikely(curr->arch.hvm_vmx.lbr_flags & LBR_FIXUP_MASK) )
+ lbr_fixup();
+
+ HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
+diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
+index 62afebec11..2c9e291bee 100644
+--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
+@@ -156,7 +156,7 @@ struct arch_vmx_struct {
+ /* Are we emulating rather than VMENTERing? */
+ uint8_t vmx_emulate;
+
+- uint8_t lbr_fixup_enabled;
++ uint8_t lbr_flags;
+
+ /* Bitmask of segments that we can't safely use in virtual 8086 mode */
+ uint16_t vm86_segment_mask;
+--
+2.18.0
+
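The behavioural change boils down to a once-only guard: the LBR MSRs are inserted into the guest load/save list the first time the guest sets MSR_DEBUGCTL.LBR, and later writes only consult the fixup flags. A condensed sketch of the resulting shape (insert_lbr_msrs is an illustrative helper standing in for the lbr_info walk in the hunk above):

    if ( !(v->arch.hvm_vmx.lbr_flags & LBR_MSRS_INSERTED) &&
         (msr_content & IA32_DEBUGCTLMSR_LBR) )
    {
        /* Walk lbr_info once; on any failure, crash the domain rather than
         * injecting #MC, since the guest cannot execute correctly anyway. */
        if ( insert_lbr_msrs(v) )
            domain_crash(v->domain);
        else
            v->arch.hvm_vmx.lbr_flags |= LBR_MSRS_INSERTED;
    }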
diff --git a/emulators/xen-kernel411/files/0026-x86-vmx-Pass-an-MSR-value-into-vmx_msr_add.patch b/emulators/xen-kernel411/files/0026-x86-vmx-Pass-an-MSR-value-into-vmx_msr_add.patch
new file mode 100644
index 000000000000..b35a79ffebdc
--- /dev/null
+++ b/emulators/xen-kernel411/files/0026-x86-vmx-Pass-an-MSR-value-into-vmx_msr_add.patch
@@ -0,0 +1,148 @@
+From 8b35b978a273a153ceadccd9c02d433f8be1c9bd Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 7 May 2018 11:57:00 +0100
+Subject: [PATCH 26/42] x86/vmx: Pass an MSR value into vmx_msr_add()
+
+The main purpose of this change is to allow us to set a specific MSR value,
+without needing to know whether there is already a load/save list slot for it.
+
+Previously, callers wanting this property needed to call both vmx_add_*_msr()
+and vmx_write_*_msr() to cover both cases, and there are no callers which want
+the old behaviour of being a no-op if an entry already existed for the MSR.
+
+As a result of this API improvement, the default value for guest MSRs need not
+be 0, and the default for host MSRs need not be passed via hardware register.
+In practice, this cleans up the VPMU allocation logic, and avoids an MSR read
+as part of vcpu construction.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit ee7689b94ac7094b975ab4a023cfeae209da0a36)
+---
+ xen/arch/x86/cpu/vpmu_intel.c | 6 ++----
+ xen/arch/x86/hvm/vmx/vmcs.c | 14 +++++++-------
+ xen/arch/x86/hvm/vmx/vmx.c | 2 +-
+ xen/include/asm-x86/hvm/vmx/vmcs.h | 20 ++++++++++++++------
+ 4 files changed, 24 insertions(+), 18 deletions(-)
+
+diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c
+index c499e69f2f..1fc79c9ff4 100644
+--- a/xen/arch/x86/cpu/vpmu_intel.c
++++ b/xen/arch/x86/cpu/vpmu_intel.c
+@@ -454,13 +454,11 @@ static int core2_vpmu_alloc_resource(struct vcpu *v)
+
+ if ( is_hvm_vcpu(v) )
+ {
+- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+- if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
++ if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
+ goto out_err;
+
+- if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
++ if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0) )
+ goto out_err;
+- vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
+
+ core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) +
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index e86f292fbc..af422b3f92 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -1339,7 +1339,8 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+ return ((ent < end) && (ent->index == msr)) ? ent : NULL;
+ }
+
+-int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type)
++int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
++ enum vmx_msr_list_type type)
+ {
+ struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
+ struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
+@@ -1398,11 +1399,9 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type)
+ ent = locate_msr_entry(start, end, msr);
+
+ if ( (ent < end) && (ent->index == msr) )
+- {
+- rc = 0;
+- goto out;
+- }
++ goto found;
+
++ /* If there isn't an existing entry for msr, insert room for one. */
+ if ( total == (PAGE_SIZE / sizeof(*ent)) )
+ {
+ rc = -ENOSPC;
+@@ -1417,17 +1416,18 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type)
+ switch ( type )
+ {
+ case VMX_MSR_HOST:
+- rdmsrl(msr, ent->data);
+ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, ++vmx->host_msr_count);
+ break;
+
+ case VMX_MSR_GUEST:
+- ent->data = 0;
+ __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_count);
+ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_count);
+ break;
+ }
+
++ /* Set the msr's value. */
++ found:
++ ent->data = val;
+ rc = 0;
+
+ out:
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 5f01652d48..5745543e49 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -3142,7 +3142,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+
+ for ( i = 0; i < lbr->count; i++ )
+ {
+- int rc = vmx_add_guest_msr(v, lbr->base + i);
++ int rc = vmx_add_guest_msr(v, lbr->base + i, 0);
+
+ if ( unlikely(rc) )
+ {
+diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
+index 2c9e291bee..f94a108ea5 100644
+--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
+@@ -544,16 +544,24 @@ enum vmx_msr_list_type {
+ VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */
+ };
+
+-int vmx_add_msr(struct vcpu *v, uint32_t msr, enum vmx_msr_list_type type);
++/**
++ * Add an MSR to an MSR list (inserting space for the entry if necessary), and
++ * set the MSRs value.
++ *
++ * May fail if unable to allocate memory for the list, or the total number of
++ * entries exceeds the memory allocated.
++ */
++int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
++ enum vmx_msr_list_type type);
+
+-static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr)
++static inline int vmx_add_guest_msr(struct vcpu *v, uint32_t msr, uint64_t val)
+ {
+- return vmx_add_msr(v, msr, VMX_MSR_GUEST);
++ return vmx_add_msr(v, msr, val, VMX_MSR_GUEST);
+ }
+-
+-static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr)
++static inline int vmx_add_host_load_msr(struct vcpu *v, uint32_t msr,
++ uint64_t val)
+ {
+- return vmx_add_msr(v, msr, VMX_MSR_HOST);
++ return vmx_add_msr(v, msr, val, VMX_MSR_HOST);
+ }
+
+ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+--
+2.18.0
+
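The vpmu hunk above is the clearest before/after illustration of the API change; condensed, with return-value checks elided for brevity:

    /* Before: add a zeroed slot, then set the value in separate steps. */
    wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
    vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL);
    vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL);
    vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0);

    /* After: the value travels with the insertion, and an existing entry
     * is overwritten rather than left untouched. */
    vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0);
    vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, 0);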
diff --git a/emulators/xen-kernel411/files/0027-x86-vmx-Support-load-only-guest-MSR-list-entries.patch b/emulators/xen-kernel411/files/0027-x86-vmx-Support-load-only-guest-MSR-list-entries.patch
new file mode 100644
index 000000000000..8a9c30497dfd
--- /dev/null
+++ b/emulators/xen-kernel411/files/0027-x86-vmx-Support-load-only-guest-MSR-list-entries.patch
@@ -0,0 +1,208 @@
+From 7b420e8a82cc8664e086ed31ec5e80615bd6225f Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 7 May 2018 11:57:00 +0100
+Subject: [PATCH 27/42] x86/vmx: Support load-only guest MSR list entries
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Currently, the VMX_MSR_GUEST type maintains completely symmetric guest load
+and save lists, by pointing VM_EXIT_MSR_STORE_ADDR and VM_ENTRY_MSR_LOAD_ADDR
+at the same page, and setting VM_EXIT_MSR_STORE_COUNT and
+VM_ENTRY_MSR_LOAD_COUNT to the same value.
+
+However, for MSRs which we won't let the guest have direct access to, having
+hardware save the current value on VMExit is unnecessary overhead.
+
+To avoid this overhead, we must make the load and save lists asymmetric. By
+making the entry load count greater than the exit store count, we can maintain
+two adjacent lists of MSRs, the first of which is saved and restored, and the
+second of which is only restored on VMEntry.
+
+For simplicity:
+ * Both adjacent lists are still sorted by MSR index.
+ * It is undefined behaviour to insert the same MSR into both lists.
+ * The total size of both lists is still limited to 256 entries (one 4k page).
+
+Split the current msr_count field into msr_{load,save}_count, and introduce a
+new VMX_MSR_GUEST_LOADONLY type, and update vmx_{add,find}_msr() to calculate
+which sublist to search, based on type. VMX_MSR_HOST has no logical sublist,
+whereas VMX_MSR_GUEST has a sublist between 0 and the save count, while
+VMX_MSR_GUEST_LOADONLY has a sublist between the save count and the load
+count.
+
+One subtle point is that inserting an MSR into the load-save list involves
+moving the entire load-only list, and updating both counts.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+(cherry picked from commit 1ac46b55632626aeb935726e1b0a71605ef6763a)
+---
+ xen/arch/x86/hvm/vmx/vmcs.c | 46 +++++++++++++++++++++++-------
+ xen/arch/x86/hvm/vmx/vmx.c | 2 +-
+ xen/include/asm-x86/hvm/vmx/vmcs.h | 7 ++++-
+ 3 files changed, 43 insertions(+), 12 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index af422b3f92..ca652c49cb 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -1310,7 +1310,7 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+ {
+ const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
+ struct vmx_msr_entry *start = NULL, *ent, *end;
+- unsigned int total;
++ unsigned int substart, subend, total;
+
+ ASSERT(v == current || !vcpu_runnable(v));
+
+@@ -1318,12 +1318,23 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+ {
+ case VMX_MSR_HOST:
+ start = vmx->host_msr_area;
+- total = vmx->host_msr_count;
++ substart = 0;
++ subend = vmx->host_msr_count;
++ total = subend;
+ break;
+
+ case VMX_MSR_GUEST:
+ start = vmx->msr_area;
+- total = vmx->msr_count;
++ substart = 0;
++ subend = vmx->msr_save_count;
++ total = vmx->msr_load_count;
++ break;
++
++ case VMX_MSR_GUEST_LOADONLY:
++ start = vmx->msr_area;
++ substart = vmx->msr_save_count;
++ subend = vmx->msr_load_count;
++ total = subend;
+ break;
+
+ default:
+@@ -1334,7 +1345,7 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+ return NULL;
+
+ end = start + total;
+- ent = locate_msr_entry(start, end, msr);
++ ent = locate_msr_entry(start + substart, start + subend, msr);
+
+ return ((ent < end) && (ent->index == msr)) ? ent : NULL;
+ }
+@@ -1344,7 +1355,7 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
+ {
+ struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
+ struct vmx_msr_entry **ptr, *start = NULL, *ent, *end;
+- unsigned int total;
++ unsigned int substart, subend, total;
+ int rc;
+
+ ASSERT(v == current || !vcpu_runnable(v));
+@@ -1353,12 +1364,23 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
+ {
+ case VMX_MSR_HOST:
+ ptr = &vmx->host_msr_area;
+- total = vmx->host_msr_count;
++ substart = 0;
++ subend = vmx->host_msr_count;
++ total = subend;
+ break;
+
+ case VMX_MSR_GUEST:
+ ptr = &vmx->msr_area;
+- total = vmx->msr_count;
++ substart = 0;
++ subend = vmx->msr_save_count;
++ total = vmx->msr_load_count;
++ break;
++
++ case VMX_MSR_GUEST_LOADONLY:
++ ptr = &vmx->msr_area;
++ substart = vmx->msr_save_count;
++ subend = vmx->msr_load_count;
++ total = subend;
+ break;
+
+ default:
+@@ -1388,6 +1410,7 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
+ break;
+
+ case VMX_MSR_GUEST:
++ case VMX_MSR_GUEST_LOADONLY:
+ __vmwrite(VM_EXIT_MSR_STORE_ADDR, addr);
+ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, addr);
+ break;
+@@ -1396,7 +1419,7 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
+
+ start = *ptr;
+ end = start + total;
+- ent = locate_msr_entry(start, end, msr);
++ ent = locate_msr_entry(start + substart, start + subend, msr);
+
+ if ( (ent < end) && (ent->index == msr) )
+ goto found;
+@@ -1420,8 +1443,11 @@ int vmx_add_msr(struct vcpu *v, uint32_t msr, uint64_t val,
+ break;
+
+ case VMX_MSR_GUEST:
+- __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_count);
+- __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_count);
++ __vmwrite(VM_EXIT_MSR_STORE_COUNT, ++vmx->msr_save_count);
++
++ /* Fallthrough */
++ case VMX_MSR_GUEST_LOADONLY:
++ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, ++vmx->msr_load_count);
+ break;
+ }
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 5745543e49..1e32f61225 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -4200,7 +4200,7 @@ out:
+ static void lbr_tsx_fixup(void)
+ {
+ struct vcpu *curr = current;
+- unsigned int msr_count = curr->arch.hvm_vmx.msr_count;
++ unsigned int msr_count = curr->arch.hvm_vmx.msr_save_count;
+ struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
+ struct vmx_msr_entry *msr;
+
+diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
+index f94a108ea5..57e5098b99 100644
+--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
++++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
+@@ -139,7 +139,8 @@ struct arch_vmx_struct {
+ */
+ struct vmx_msr_entry *msr_area;
+ struct vmx_msr_entry *host_msr_area;
+- unsigned int msr_count;
++ unsigned int msr_load_count;
++ unsigned int msr_save_count;
+ unsigned int host_msr_count;
+
+ unsigned long eoi_exitmap_changed;
+@@ -542,12 +543,16 @@ enum vmx_insn_errno
+ enum vmx_msr_list_type {
+ VMX_MSR_HOST, /* MSRs loaded on VMExit. */
+ VMX_MSR_GUEST, /* MSRs saved on VMExit, loaded on VMEntry. */
++ VMX_MSR_GUEST_LOADONLY, /* MSRs loaded on VMEntry only. */
+ };
+
+ /**
+ * Add an MSR to an MSR list (inserting space for the entry if necessary), and
+ * set the MSRs value.
+ *
++ * It is undefined behaviour to try and insert the same MSR into both the
++ * GUEST and GUEST_LOADONLY list.
++ *
+ * May fail if unable to allocate memory for the list, or the total number of
+ * entries exceeds the memory allocated.
+ */
+--
+2.18.0
+
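The layout that results from this patch is easiest to see as index ranges over the single shared page, together with how a load-only entry would be added; MSR_EXAMPLE below is a placeholder, not a real MSR:

    /*
     * msr_area[0 .. msr_save_count)               VMX_MSR_GUEST
     *     saved on VMExit, loaded on VMEntry
     * msr_area[msr_save_count .. msr_load_count)  VMX_MSR_GUEST_LOADONLY
     *     loaded on VMEntry only
     *
     * VM_EXIT_MSR_STORE_COUNT = msr_save_count
     * VM_ENTRY_MSR_LOAD_COUNT = msr_load_count
     */
    rc = vmx_add_msr(v, MSR_EXAMPLE, val, VMX_MSR_GUEST_LOADONLY);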
diff --git a/emulators/xen-kernel411/files/0028-VMX-fix-vmx_-find-del-_msr-build.patch b/emulators/xen-kernel411/files/0028-VMX-fix-vmx_-find-del-_msr-build.patch
new file mode 100644
index 000000000000..39002c7343db
--- /dev/null
+++ b/emulators/xen-kernel411/files/0028-VMX-fix-vmx_-find-del-_msr-build.patch
@@ -0,0 +1,61 @@
+From 1d32c21975097e64a7ecf0932680a3b6d53d00a4 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Thu, 19 Jul 2018 11:54:45 +0200
+Subject: [PATCH 28/42] VMX: fix vmx_{find,del}_msr() build
+
+Older gcc at -O2 (and perhaps higher) does not recognize that apparently
+uninitialized variables aren't really uninitialized. Pull out the
+assignments used by two of the three case blocks and make them
+initializers of the variables, as I think I had suggested during review.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+Acked-by: Kevin Tian <kevin.tian@intel.com>
+(cherry picked from commit 97cb0516a322ecdf0032fa9d8aa1525c03d7772f)
+---
+ xen/arch/x86/hvm/vmx/vmcs.c | 12 ++++--------
+ 1 file changed, 4 insertions(+), 8 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index ca652c49cb..30a33dd0bd 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -1310,7 +1310,8 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+ {
+ const struct arch_vmx_struct *vmx = &v->arch.hvm_vmx;
+ struct vmx_msr_entry *start = NULL, *ent, *end;
+- unsigned int substart, subend, total;
++ unsigned int substart = 0, subend = vmx->msr_save_count;
++ unsigned int total = vmx->msr_load_count;
+
+ ASSERT(v == current || !vcpu_runnable(v));
+
+@@ -1318,23 +1319,18 @@ struct vmx_msr_entry *vmx_find_msr(const struct vcpu *v, uint32_t msr,
+ {
+ case VMX_MSR_HOST:
+ start = vmx->host_msr_area;
+- substart = 0;
+ subend = vmx->host_msr_count;
+ total = subend;
+ break;
+
+ case VMX_MSR_GUEST:
+ start = vmx->msr_area;
+- substart = 0;
+- subend = vmx->msr_save_count;
+- total = vmx->msr_load_count;
+ break;
+
+ case VMX_MSR_GUEST_LOADONLY:
+ start = vmx->msr_area;
+- substart = vmx->msr_save_count;
+- subend = vmx->msr_load_count;
+- total = subend;
++ substart = subend;
++ subend = total;
+ break;
+
+ default:
+--
+2.18.0
+
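The fix illustrates a general idiom for older gcc's spurious maybe-uninitialized warnings: initialise with the values of the most common case and let the other cases override them, instead of assigning in every case arm. A generic sketch of the idiom (names are illustrative):

    unsigned int substart = 0, subend = save_count;   /* VMX_MSR_GUEST shape */
    unsigned int total = load_count;

    switch ( type )
    {
    case VMX_MSR_GUEST:
        break;                         /* defaults already correct */
    case VMX_MSR_GUEST_LOADONLY:
        substart = subend;             /* derive from the defaults */
        subend = total;
        break;
    }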
diff --git a/emulators/xen-kernel411/files/0029-ARM-disable-grant-table-v2.patch b/emulators/xen-kernel411/files/0029-ARM-disable-grant-table-v2.patch
new file mode 100644
index 000000000000..2e7005e88c2e
--- /dev/null
+++ b/emulators/xen-kernel411/files/0029-ARM-disable-grant-table-v2.patch
@@ -0,0 +1,66 @@
+From fa79f9e762be390b56218437ed317a695a03a5e7 Mon Sep 17 00:00:00 2001
+From: Stefano Stabellini <sstabellini@kernel.org>
+Date: Mon, 13 Aug 2018 17:25:51 +0100
+Subject: [PATCH 29/42] ARM: disable grant table v2
+
+It was never expected to work; the implementation is incomplete.
+
+As a side effect, it also prevents guests from triggering a
+"BUG_ON(page_get_owner(pg) != d)" in gnttab_unpopulate_status_frames().
+
+This is XSA-268.
+
+Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 9a5c16a3e75778c8a094ca87784d93b74676f46c)
+---
+ docs/misc/xen-command-line.markdown | 2 ++
+ xen/common/grant_table.c | 6 +++++-
+ xen/include/asm-arm/grant_table.h | 1 +
+ 3 files changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 3b710b71fb..e5e7fdc405 100644
+--- a/docs/misc/xen-command-line.markdown
++++ b/docs/misc/xen-command-line.markdown
+@@ -936,6 +936,8 @@ version are 1 and 2.
+ use of grant table v2 without transitive grants is an ABI breakage from the
+ guests point of view.
+
++The usage of gnttab v2 is not security supported on ARM platforms.
++
+ ### gnttab\_max\_frames
+ > `= <integer>`
+
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index c757b7f6f5..231ecf509a 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -97,7 +97,11 @@ static unsigned int __read_mostly max_maptrack_frames =
+ DEFAULT_MAX_MAPTRACK_FRAMES;
+ integer_runtime_param("gnttab_max_maptrack_frames", max_maptrack_frames);
+
+-static unsigned int __read_mostly opt_gnttab_max_version = 2;
++#ifndef GNTTAB_MAX_VERSION
++#define GNTTAB_MAX_VERSION 2
++#endif
++
++static unsigned int __read_mostly opt_gnttab_max_version = GNTTAB_MAX_VERSION;
+ static bool __read_mostly opt_transitive_grants = true;
+
+ static int __init parse_gnttab(const char *s)
+diff --git a/xen/include/asm-arm/grant_table.h b/xen/include/asm-arm/grant_table.h
+index e52936c79f..24958e4670 100644
+--- a/xen/include/asm-arm/grant_table.h
++++ b/xen/include/asm-arm/grant_table.h
+@@ -7,6 +7,7 @@
+ #include <xen/sched.h>
+
+ #define INITIAL_NR_GRANT_FRAMES 1U
++#define GNTTAB_MAX_VERSION 1
+
+ struct grant_table_arch {
+ gfn_t *shared_gfn;
+--
+2.18.0
+
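The mechanism is a simple compile-time clamp: common code only supplies a default when the architecture header has not already pinned a lower maximum, and the new asm-arm definition pins it to 1. A minimal sketch of the pattern:

    /* asm-<arch>/grant_table.h may define a lower ceiling, e.g. on ARM:
     *   #define GNTTAB_MAX_VERSION 1
     */
    #ifndef GNTTAB_MAX_VERSION
    #define GNTTAB_MAX_VERSION 2        /* x86 keeps grant table v2 available */
    #endif

    static unsigned int __read_mostly opt_gnttab_max_version = GNTTAB_MAX_VERSION;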
diff --git a/emulators/xen-kernel411/files/0030-x86-vtx-Fix-the-checking-for-unknown-invalid-MSR_DEB.patch b/emulators/xen-kernel411/files/0030-x86-vtx-Fix-the-checking-for-unknown-invalid-MSR_DEB.patch
new file mode 100644
index 000000000000..00ec3e3668bc
--- /dev/null
+++ b/emulators/xen-kernel411/files/0030-x86-vtx-Fix-the-checking-for-unknown-invalid-MSR_DEB.patch
@@ -0,0 +1,133 @@
+From 48fb482ef695c6b193ccfca665e6dd302eb230e2 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 13 Aug 2018 17:26:21 +0100
+Subject: [PATCH 30/42] x86/vtx: Fix the checking for unknown/invalid
+ MSR_DEBUGCTL bits
+
+The VPMU_MODE_OFF early-exit in vpmu_do_wrmsr() introduced by c/s
+11fe998e56 bypasses all reserved bit checking in the general case. As a
+result, a guest can enable BTS when it shouldn't be permitted to, and
+lock up the entire host.
+
+With vPMU active (not a security supported configuration, but useful for
+debugging), the reserved bit checking is broken, caused by the original
+BTS changeset 1a8aa75ed.
+
+From a correctness standpoint, it is not possible to have two different
+pieces of code responsible for different parts of value checking, if
+there isn't an accumulation of bits which have been checked. A
+practical upshot of this is that a guest can set any value it
+wishes (usually resulting in a vmentry failure for bad guest state).
+
+Therefore, fix this by implementing all the reserved bit checking in the
+main MSR_DEBUGCTL block, and removing all handling of DEBUGCTL from the
+vPMU MSR logic.
+
+This is XSA-269.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 2a8a8e99feb950504559196521bc9fd63ed3a962)
+---
+ xen/arch/x86/cpu/vpmu_intel.c | 20 --------------------
+ xen/arch/x86/hvm/vmx/vmx.c | 29 ++++++++++++++++++++---------
+ 2 files changed, 20 insertions(+), 29 deletions(-)
+
+diff --git a/xen/arch/x86/cpu/vpmu_intel.c b/xen/arch/x86/cpu/vpmu_intel.c
+index 1fc79c9ff4..6e27f6ec8e 100644
+--- a/xen/arch/x86/cpu/vpmu_intel.c
++++ b/xen/arch/x86/cpu/vpmu_intel.c
+@@ -533,27 +533,7 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
+ uint64_t *enabled_cntrs;
+
+ if ( !core2_vpmu_msr_common_check(msr, &type, &index) )
+- {
+- /* Special handling for BTS */
+- if ( msr == MSR_IA32_DEBUGCTLMSR )
+- {
+- supported |= IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
+- IA32_DEBUGCTLMSR_BTINT;
+-
+- if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
+- supported |= IA32_DEBUGCTLMSR_BTS_OFF_OS |
+- IA32_DEBUGCTLMSR_BTS_OFF_USR;
+- if ( !(msr_content & ~supported) &&
+- vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
+- return 0;
+- if ( (msr_content & supported) &&
+- !vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
+- printk(XENLOG_G_WARNING
+- "%pv: Debug Store unsupported on this CPU\n",
+- current);
+- }
+ return -EINVAL;
+- }
+
+ ASSERT(!supported);
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 1e32f61225..c7cf3a8fbc 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -3038,11 +3038,14 @@ void vmx_vlapic_msr_changed(struct vcpu *v)
+ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ {
+ struct vcpu *v = current;
++ const struct cpuid_policy *cp = v->domain->arch.cpuid;
+
+ HVM_DBG_LOG(DBG_LEVEL_MSR, "ecx=%#x, msr_value=%#"PRIx64, msr, msr_content);
+
+ switch ( msr )
+ {
++ uint64_t rsvd;
++
+ case MSR_IA32_SYSENTER_CS:
+ __vmwrite(GUEST_SYSENTER_CS, msr_content);
+ break;
+@@ -3095,18 +3098,26 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+ wrmsrl(MSR_SYSCALL_MASK, msr_content);
+ break;
+
+- case MSR_IA32_DEBUGCTLMSR: {
+- uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF;
++ case MSR_IA32_DEBUGCTLMSR:
++ rsvd = ~(IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF);
+
+- if ( boot_cpu_has(X86_FEATURE_RTM) )
+- supported |= IA32_DEBUGCTLMSR_RTM;
+- if ( msr_content & ~supported )
++ /* TODO: Wire vPMU settings properly through the CPUID policy */
++ if ( vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_BTS) )
+ {
+- /* Perhaps some other bits are supported in vpmu. */
+- if ( vpmu_do_wrmsr(msr, msr_content, supported) )
+- break;
++ rsvd &= ~(IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
++ IA32_DEBUGCTLMSR_BTINT);
++
++ if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
++ rsvd &= ~(IA32_DEBUGCTLMSR_BTS_OFF_OS |
++ IA32_DEBUGCTLMSR_BTS_OFF_USR);
+ }
+
++ if ( cp->feat.rtm )
++ rsvd &= ~IA32_DEBUGCTLMSR_RTM;
++
++ if ( msr_content & rsvd )
++ goto gp_fault;
++
+ /*
+ * When a guest first enables LBR, arrange to save and restore the LBR
+ * MSRs and allow the guest direct access.
+@@ -3165,7 +3176,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+
+ __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
+ break;
+- }
++
+ case MSR_IA32_FEATURE_CONTROL:
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ /* None of these MSRs are writeable. */
+--
+2.18.0
+
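The new structure accumulates every permitted bit into a single reserved-bit mask before taking any action, so there is no second checker to fall back to. Condensed from the hunk above:

    uint64_t rsvd = ~(IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF);

    if ( vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_BTS) )
        rsvd &= ~(IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS |
                  IA32_DEBUGCTLMSR_BTINT);  /* plus the OFF_OS/OFF_USR bits
                                               when DS-CPL is present */
    if ( cp->feat.rtm )
        rsvd &= ~IA32_DEBUGCTLMSR_RTM;

    if ( msr_content & rsvd )               /* any remaining bit is reserved */
        goto gp_fault;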
diff --git a/emulators/xen-kernel411/files/0032-x86-spec-ctrl-Calculate-safe-PTE-addresses-for-L1TF-.patch b/emulators/xen-kernel411/files/0032-x86-spec-ctrl-Calculate-safe-PTE-addresses-for-L1TF-.patch
new file mode 100644
index 000000000000..1080d9c767f2
--- /dev/null
+++ b/emulators/xen-kernel411/files/0032-x86-spec-ctrl-Calculate-safe-PTE-addresses-for-L1TF-.patch
@@ -0,0 +1,313 @@
+From d044f6cc590c58178d87ad78f1859d1c7905ee0b Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Wed, 25 Jul 2018 12:10:19 +0000
+Subject: [PATCH 32/42] x86/spec-ctrl: Calculate safe PTE addresses for L1TF
+ mitigations
+
+Safe PTE addresses for L1TF mitigations are ones which are within the L1D
+address width (may be wider than reported in CPUID), and above the highest
+cacheable RAM/NVDIMM/BAR/etc.
+
+All logic here is best-effort heuristics, which should in practice be fine for
+most hardware. Future work will see about disentangling the SRAT handling
+further, as well as having L0 pass this information down to lower levels when
+virtualised.
+
+This is part of XSA-273 / CVE-2018-3620.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit b03a57c9383b32181e60add6b6de12b473652aa4)
+---
+ xen/arch/x86/setup.c | 12 +++
+ xen/arch/x86/spec_ctrl.c | 153 ++++++++++++++++++++++++++++++++
+ xen/arch/x86/srat.c | 8 +-
+ xen/common/efi/boot.c | 12 +++
+ xen/include/asm-x86/spec_ctrl.h | 7 ++
+ 5 files changed, 190 insertions(+), 2 deletions(-)
+
+diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
+index 66fd13f93a..3cd3e81b30 100644
+--- a/xen/arch/x86/setup.c
++++ b/xen/arch/x86/setup.c
+@@ -912,6 +912,18 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ /* Sanitise the raw E820 map to produce a final clean version. */
+ max_page = raw_max_page = init_e820(memmap_type, &e820_raw);
+
++ if ( !efi_enabled(EFI_BOOT) )
++ {
++ /*
++ * Supplement the heuristics in l1tf_calculations() by assuming that
++ * anything referenced in the E820 may be cacheable.
++ */
++ l1tf_safe_maddr =
++ max(l1tf_safe_maddr,
++ ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr +
++ e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE));
++ }
++
+ /* Create a temporary copy of the E820 map. */
+ memcpy(&boot_e820, &e820, sizeof(e820));
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 32213ace86..fe15a58de0 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -50,6 +50,10 @@ bool __initdata bsp_delay_spec_ctrl;
+ uint8_t __read_mostly default_xen_spec_ctrl;
+ uint8_t __read_mostly default_spec_ctrl_flags;
+
++paddr_t __read_mostly l1tf_addr_mask, __read_mostly l1tf_safe_maddr;
++static bool __initdata cpu_has_bug_l1tf;
++static unsigned int __initdata l1d_maxphysaddr;
++
+ static int __init parse_bti(const char *s)
+ {
+ const char *ss;
+@@ -420,6 +424,153 @@ static bool __init should_use_eager_fpu(void)
+ }
+ }
+
++/* Calculate whether this CPU is vulnerable to L1TF. */
++static __init void l1tf_calculations(uint64_t caps)
++{
++ bool hit_default = false;
++
++ l1d_maxphysaddr = paddr_bits;
++
++ /* L1TF is only known to affect Intel Family 6 processors at this time. */
++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
++ boot_cpu_data.x86 == 6 )
++ {
++ switch ( boot_cpu_data.x86_model )
++ {
++ /*
++ * Core processors since at least Penryn are vulnerable.
++ */
++ case 0x17: /* Penryn */
++ case 0x1d: /* Dunnington */
++ cpu_has_bug_l1tf = true;
++ break;
++
++ case 0x1f: /* Auburndale / Havendale */
++ case 0x1e: /* Nehalem */
++ case 0x1a: /* Nehalem EP */
++ case 0x2e: /* Nehalem EX */
++ case 0x25: /* Westmere */
++ case 0x2c: /* Westmere EP */
++ case 0x2f: /* Westmere EX */
++ cpu_has_bug_l1tf = true;
++ l1d_maxphysaddr = 44;
++ break;
++
++ case 0x2a: /* SandyBridge */
++ case 0x2d: /* SandyBridge EP/EX */
++ case 0x3a: /* IvyBridge */
++ case 0x3e: /* IvyBridge EP/EX */
++ case 0x3c: /* Haswell */
++ case 0x3f: /* Haswell EX/EP */
++ case 0x45: /* Haswell D */
++ case 0x46: /* Haswell H */
++ case 0x3d: /* Broadwell */
++ case 0x47: /* Broadwell H */
++ case 0x4f: /* Broadwell EP/EX */
++ case 0x56: /* Broadwell D */
++ case 0x4e: /* Skylake M */
++ case 0x55: /* Skylake X */
++ case 0x5e: /* Skylake D */
++ case 0x66: /* Cannonlake */
++ case 0x67: /* Cannonlake? */
++ case 0x8e: /* Kabylake M */
++ case 0x9e: /* Kabylake D */
++ cpu_has_bug_l1tf = true;
++ l1d_maxphysaddr = 46;
++ break;
++
++ /*
++ * Atom processors are not vulnerable.
++ */
++ case 0x1c: /* Pineview */
++ case 0x26: /* Lincroft */
++ case 0x27: /* Penwell */
++ case 0x35: /* Cloverview */
++ case 0x36: /* Cedarview */
++ case 0x37: /* Baytrail / Valleyview (Silvermont) */
++ case 0x4d: /* Avaton / Rangely (Silvermont) */
++ case 0x4c: /* Cherrytrail / Brasswell */
++ case 0x4a: /* Merrifield */
++ case 0x5a: /* Moorefield */
++ case 0x5c: /* Goldmont */
++ case 0x5f: /* Denverton */
++ case 0x7a: /* Gemini Lake */
++ break;
++
++ /*
++ * Knights processors are not vulnerable.
++ */
++ case 0x57: /* Knights Landing */
++ case 0x85: /* Knights Mill */
++ break;
++
++ default:
++ /* Defer printk() until we've accounted for RDCL_NO. */
++ hit_default = true;
++ cpu_has_bug_l1tf = true;
++ break;
++ }
++ }
++
++ /* Any processor advertising RDCL_NO should be not vulnerable to L1TF. */
++ if ( caps & ARCH_CAPABILITIES_RDCL_NO )
++ cpu_has_bug_l1tf = false;
++
++ if ( cpu_has_bug_l1tf && hit_default )
++ printk("Unrecognised CPU model %#x - assuming vulnerable to L1TF\n",
++ boot_cpu_data.x86_model);
++
++ /*
++ * L1TF safe address heuristics. These apply to the real hardware we are
++ * running on, and are best-effort-only if Xen is virtualised.
++ *
++ * The address mask which the L1D cache uses, which might be wider than
++ * the CPUID-reported maxphysaddr.
++ */
++ l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK;
++
++ /*
++ * To be safe, l1tf_safe_maddr must be above the highest cacheable entity
++ * in system physical address space. However, to preserve space for
++ * paged-out metadata, it should be as low as possible above the highest
++ * cacheable address, so as to require fewer high-order bits being set.
++ *
++ * These heuristics are based on some guesswork to improve the likelihood
++ * of safety in the common case, including Linux's L1TF mitigation of
++ * inverting all address bits in a non-present PTE.
++ *
++ * - If L1D is wider than CPUID (Nehalem and later mobile/desktop/low end
++ * server), setting any address bit beyond CPUID maxphysaddr guarantees
++ * to make the PTE safe. This case doesn't require all the high-order
++ * bits being set, and doesn't require any other source of information
++ * for safety.
++ *
++ * - If L1D is the same as CPUID (Pre-Nehalem, or high end server), we
++ * must sacrifice high order bits from the real address space for
++ * safety. Therefore, make a blind guess that there is nothing
++ * cacheable in the top quarter of physical address space.
++ *
++ * It is exceedingly unlikely for machines to be populated with this
++ * much RAM (likely 512G on pre-Nehalem, 16T on Nehalem/Westmere, 64T on
++ * Sandybridge and later) due to the sheer volume of DIMMs this would
++ * actually take.
++ *
++ * However, it is possible to find machines this large, so the "top
++ * quarter" guess is supplemented to push the limit higher if references
++ * to cacheable mappings (E820/SRAT/EFI/etc) are found above the top
++ * quarter boundary.
++ *
++ * Finally, this top quarter guess gives us a good chance of being safe
++ * when running virtualised (and the CPUID maxphysaddr hasn't been
++ * levelled for heterogeneous migration safety), where the safety
++ * consideration is still in terms of host details, but all E820/etc
++ * information is in terms of guest physical layout.
++ */
++ l1tf_safe_maddr = max(l1tf_safe_maddr, ((l1d_maxphysaddr > paddr_bits)
++ ? (1ul << paddr_bits)
++ : (3ul << (paddr_bits - 2))));
++}
++
+ int8_t __read_mostly opt_xpti = -1;
+
+ static __init void xpti_init_default(uint64_t caps)
+@@ -633,6 +784,8 @@ void __init init_speculation_mitigations(void)
+ else
+ setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
+
++ l1tf_calculations(caps);
++
+ print_details(thunk, caps);
+
+ /*
+diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
+index 166eb44fe2..2d70b45909 100644
+--- a/xen/arch/x86/srat.c
++++ b/xen/arch/x86/srat.c
+@@ -20,6 +20,7 @@
+ #include <xen/pfn.h>
+ #include <asm/e820.h>
+ #include <asm/page.h>
++#include <asm/spec_ctrl.h>
+
+ static struct acpi_table_slit *__read_mostly acpi_slit;
+
+@@ -284,6 +285,11 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
+ if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
+ return;
+
++ start = ma->base_address;
++ end = start + ma->length;
++ /* Supplement the heuristics in l1tf_calculations(). */
++ l1tf_safe_maddr = max(l1tf_safe_maddr, ROUNDUP(end, PAGE_SIZE));
++
+ if (num_node_memblks >= NR_NODE_MEMBLKS)
+ {
+ dprintk(XENLOG_WARNING,
+@@ -292,8 +298,6 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
+ return;
+ }
+
+- start = ma->base_address;
+- end = start + ma->length;
+ pxm = ma->proximity_domain;
+ if (srat_rev < 2)
+ pxm &= 0xff;
+diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
+index 64d12685d3..6be0b3986f 100644
+--- a/xen/common/efi/boot.c
++++ b/xen/common/efi/boot.c
+@@ -1304,6 +1304,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
+
+ #ifndef CONFIG_ARM /* TODO - runtime service support */
+
++#include <asm/spec_ctrl.h>
++
+ static bool __initdata efi_map_uc;
+
+ static int __init parse_efi_param(const char *s)
+@@ -1419,6 +1421,16 @@ void __init efi_init_memory(void)
+ desc->PhysicalStart, desc->PhysicalStart + len - 1,
+ desc->Type, desc->Attribute);
+
++ if ( (desc->Attribute & (EFI_MEMORY_WB | EFI_MEMORY_WT)) ||
++ (efi_bs_revision >= EFI_REVISION(2, 5) &&
++ (desc->Attribute & EFI_MEMORY_WP)) )
++ {
++ /* Supplement the heuristics in l1tf_calculations(). */
++ l1tf_safe_maddr =
++ max(l1tf_safe_maddr,
++ ROUNDUP(desc->PhysicalStart + len, PAGE_SIZE));
++ }
++
+ if ( !efi_enabled(EFI_RS) ||
+ (!(desc->Attribute & EFI_MEMORY_RUNTIME) &&
+ (!map_bs ||
+diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
+index fea82603ca..d7e8ed0f5f 100644
+--- a/xen/include/asm-x86/spec_ctrl.h
++++ b/xen/include/asm-x86/spec_ctrl.h
+@@ -38,6 +38,13 @@ extern int8_t opt_xpti;
+ #define OPT_XPTI_DOM0 0x01
+ #define OPT_XPTI_DOMU 0x02
+
++/*
++ * The L1D address mask, which might be wider than reported in CPUID, and the
++ * system physical address above which there are believed to be no cacheable
++ * memory regions, thus unable to leak data via the L1TF vulnerability.
++ */
++extern paddr_t l1tf_addr_mask, l1tf_safe_maddr;
++
+ static inline void init_shadow_spec_ctrl_state(void)
+ {
+ struct cpu_info *info = get_cpu_info();
+--
+2.18.0
+
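The heuristic reduces to a two-way choice, raised further by the E820/SRAT/EFI scans added elsewhere in this patch; condensed from l1tf_calculations() above:

    /* If the L1D tag is wider than CPUID maxphysaddr, any address bit beyond
     * maxphysaddr makes a PTE safe; otherwise guess that the top quarter of
     * the reported physical address space holds nothing cacheable. */
    l1tf_safe_maddr = max(l1tf_safe_maddr,
                          (l1d_maxphysaddr > paddr_bits)
                          ? (1ul << paddr_bits)
                          : (3ul << (paddr_bits - 2)));

    l1tf_addr_mask = ((1ul << l1d_maxphysaddr) - 1) & PAGE_MASK;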
diff --git a/emulators/xen-kernel411/files/0033-x86-spec-ctrl-Introduce-an-option-to-control-L1TF-mi.patch b/emulators/xen-kernel411/files/0033-x86-spec-ctrl-Introduce-an-option-to-control-L1TF-mi.patch
new file mode 100644
index 000000000000..cb259006831c
--- /dev/null
+++ b/emulators/xen-kernel411/files/0033-x86-spec-ctrl-Introduce-an-option-to-control-L1TF-mi.patch
@@ -0,0 +1,226 @@
+From 57483c09ef4fe9489ec4214989a97949916fecc0 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 23 Jul 2018 13:46:10 +0000
+Subject: [PATCH 33/42] x86/spec-ctrl: Introduce an option to control L1TF
+ mitigation for PV guests
+
+Shadowing a PV guest is only available when shadow paging is compiled in.
+When shadow paging isn't available, guests can be crashed instead as
+mitigation from Xen's point of view.
+
+Ideally, dom0 would also be potentially-shadowed-by-default, but dom0 has
+never been shadowed before, and there are some stability issues under
+investigation.
+
+This is part of XSA-273 / CVE-2018-3620.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 66a4e986819a86ba66ca2fe9d925e62a4fd30114)
+---
+ docs/misc/xen-command-line.markdown | 24 ++++++++
+ xen/arch/x86/Kconfig | 1 +
+ xen/arch/x86/spec_ctrl.c | 89 ++++++++++++++++++++++++++++-
+ xen/include/asm-x86/spec_ctrl.h | 4 ++
+ 4 files changed, 115 insertions(+), 3 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index e5e7fdc405..763cc1d878 100644
+--- a/docs/misc/xen-command-line.markdown
++++ b/docs/misc/xen-command-line.markdown
+@@ -1546,6 +1546,30 @@ do; there may be other custom operating systems which do. If you're
+ certain you don't plan on having PV guests which use this feature,
+ turning it off can reduce the attack surface.
+
++### pv-l1tf (x86)
++> `= List of [ <bool>, dom0=<bool>, domu=<bool> ]`
++
++> Default: `false` on believed-unaffected hardware, or in pv-shim mode.
++> `domu` on believed-affected hardware.
++
++Mitigations for L1TF / XSA-273 / CVE-2018-3620 for PV guests.
++
++For backwards compatibility, we may not alter an architecturally-legitimate
++pagetable entry a PV guest chooses to write. We can however force such a
++guest into shadow mode so that Xen controls the PTEs which are reachable by
++the CPU pagewalk.
++
++Shadowing is performed at the point where a PV guest first tries to write an
++L1TF-vulnerable PTE. Therefore, a PV guest kernel which has been updated with
++its own L1TF mitigations will not trigger shadow mode if it is well behaved.
++
++If CONFIG\_SHADOW\_PAGING is not compiled in, this mitigation instead crashes
++the guest when an L1TF-vulnerable PTE is written, which still allows updated,
++well-behaved PV guests to run, despite Shadow being compiled out.
++
++In the pv-shim case, Shadow is expected to be compiled out, and a malicious
++guest kernel can only leak data from the shim Xen, rather than the host Xen.
++
+ ### pv-shim (x86)
+ > `= <boolean>`
+
+diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
+index f64fc56739..cfba4a708c 100644
+--- a/xen/arch/x86/Kconfig
++++ b/xen/arch/x86/Kconfig
+@@ -72,6 +72,7 @@ config SHADOW_PAGING
+ * Running HVM guests on hardware lacking hardware paging support
+ (First-generation Intel VT-x or AMD SVM).
+ * Live migration of PV guests.
++ * L1TF sidechannel mitigation for PV guests.
+
+ Under a small number of specific workloads, shadow paging may be
+ deliberately used as a performance optimisation.
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index fe15a58de0..7995e27218 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -23,6 +23,7 @@
+ #include <asm/microcode.h>
+ #include <asm/msr.h>
+ #include <asm/processor.h>
++#include <asm/pv/shim.h>
+ #include <asm/spec_ctrl.h>
+ #include <asm/spec_ctrl_asm.h>
+
+@@ -203,6 +204,55 @@ static int __init parse_spec_ctrl(const char *s)
+ }
+ custom_param("spec-ctrl", parse_spec_ctrl);
+
++int8_t __read_mostly opt_pv_l1tf = -1;
++
++static __init int parse_pv_l1tf(const char *s)
++{
++ const char *ss;
++ int val, rc = 0;
++
++ /* Inhibit the defaults as an explicit choice has been given. */
++ if ( opt_pv_l1tf == -1 )
++ opt_pv_l1tf = 0;
++
++ /* Interpret 'pv-l1tf' alone in its positive boolean form. */
++ if ( *s == '\0' )
++ opt_xpti = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
++
++ do {
++ ss = strchr(s, ',');
++ if ( !ss )
++ ss = strchr(s, '\0');
++
++ switch ( parse_bool(s, ss) )
++ {
++ case 0:
++ opt_pv_l1tf = 0;
++ break;
++
++ case 1:
++ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
++ break;
++
++ default:
++ if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
++ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOM0) |
++ (val ? OPT_PV_L1TF_DOM0 : 0));
++ else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
++ opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOMU) |
++ (val ? OPT_PV_L1TF_DOMU : 0));
++ else
++ rc = -EINVAL;
++ break;
++ }
++
++ s = ss + 1;
++ } while ( *ss );
++
++ return rc;
++}
++custom_param("pv-l1tf", parse_pv_l1tf);
++
+ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ {
+ unsigned int _7d0 = 0, e8b = 0, tmp;
+@@ -226,9 +276,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
+ (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "");
+
+- /* Compiled-in support which pertains to BTI mitigations. */
+- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
+- printk(" Compiled-in support: INDIRECT_THUNK\n");
++ /* Compiled-in support which pertains to mitigations. */
++ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
++ printk(" Compiled-in support:"
++#ifdef CONFIG_INDIRECT_THUNK
++ " INDIRECT_THUNK"
++#endif
++#ifdef CONFIG_SHADOW_PAGING
++ " SHADOW_PAGING"
++#endif
++ "\n");
+
+ /* Settings for Xen's protection, irrespective of guests. */
+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s\n",
+@@ -242,6 +299,13 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
+ opt_ibpb ? " IBPB" : "");
+
++ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
++ if ( cpu_has_bug_l1tf || opt_pv_l1tf )
++ printk(" L1TF: believed%s vulnerable, maxphysaddr L1D %u, CPUID %u"
++ ", Safe address %"PRIx64"\n",
++ cpu_has_bug_l1tf ? "" : " not",
++ l1d_maxphysaddr, paddr_bits, l1tf_safe_maddr);
++
+ /*
+ * Alternatives blocks for protecting against and/or virtualising
+ * mitigation support for guests.
+@@ -263,6 +327,10 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s\n",
+ opt_xpti & OPT_XPTI_DOM0 ? "enabled" : "disabled",
+ opt_xpti & OPT_XPTI_DOMU ? "enabled" : "disabled");
++
++ printk(" PV L1TF shadowing: Dom0 %s, DomU %s\n",
++ opt_pv_l1tf & OPT_PV_L1TF_DOM0 ? "enabled" : "disabled",
++ opt_pv_l1tf & OPT_PV_L1TF_DOMU ? "enabled" : "disabled");
+ }
+
+ /* Calculate whether Retpoline is known-safe on this CPU. */
+@@ -786,6 +854,21 @@ void __init init_speculation_mitigations(void)
+
+ l1tf_calculations(caps);
+
++ /*
++ * By default, enable PV domU L1TF mitigations on all L1TF-vulnerable
++ * hardware, except when running in shim mode.
++ *
++ * In shim mode, SHADOW is expected to be compiled out, and a malicious
++ * guest kernel can only attack the shim Xen, not the host Xen.
++ */
++ if ( opt_pv_l1tf == -1 )
++ {
++ if ( pv_shim || !cpu_has_bug_l1tf )
++ opt_pv_l1tf = 0;
++ else
++ opt_pv_l1tf = OPT_PV_L1TF_DOMU;
++ }
++
+ print_details(thunk, caps);
+
+ /*
+diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
+index d7e8ed0f5f..cdf5737dc2 100644
+--- a/xen/include/asm-x86/spec_ctrl.h
++++ b/xen/include/asm-x86/spec_ctrl.h
+@@ -38,6 +38,10 @@ extern int8_t opt_xpti;
+ #define OPT_XPTI_DOM0 0x01
+ #define OPT_XPTI_DOMU 0x02
+
++extern int8_t opt_pv_l1tf;
++#define OPT_PV_L1TF_DOM0 0x01
++#define OPT_PV_L1TF_DOMU 0x02
++
+ /*
+ * The L1D address mask, which might be wider than reported in CPUID, and the
+ * system physical address above which there are believed to be no cacheable
+--
+2.18.0
+
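For reference, a few example option strings and the settings the documentation hunk above says they should produce (not exhaustive):

    pv-l1tf=false                  disable the mitigation for all PV guests
    pv-l1tf=domu                   shadow (or crash) only PV domUs; the default
                                   on believed-affected hardware
    pv-l1tf=dom0=false,domu=true   explicit per-domain-type control, same effect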
diff --git a/emulators/xen-kernel411/files/0034-x86-shadow-Infrastructure-to-force-a-PV-guest-into-s.patch b/emulators/xen-kernel411/files/0034-x86-shadow-Infrastructure-to-force-a-PV-guest-into-s.patch
new file mode 100644
index 000000000000..403fd8be1fa4
--- /dev/null
+++ b/emulators/xen-kernel411/files/0034-x86-shadow-Infrastructure-to-force-a-PV-guest-into-s.patch
@@ -0,0 +1,277 @@
+From 02d2c660935cfd6ff2438afb3892776dfc7db711 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Mon, 23 Jul 2018 07:11:40 +0100
+Subject: [PATCH 34/42] x86/shadow: Infrastructure to force a PV guest into
+ shadow mode
+
+To mitigate L1TF, we cannot alter an architecturally-legitimate PTE a PV guest
+chooses to write, but we can force the PV domain into shadow mode so Xen
+controls the PTEs which are reachable by the CPU pagewalk.
+
+Introduce a new shadow mode, PG_SH_forced, and a tasklet to perform the
+transition. Later patches will introduce the logic to enable this mode at the
+appropriate time.
+
+To simplify vcpu cleanup, make tasklet_kill() idempotent with respect to
+tasklet_init(), which involves adding a helper to check for an uninitialised
+list head.
+
+This is part of XSA-273 / CVE-2018-3620.
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Tim Deegan <tim@xen.org>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit b76ec3946bf6caca2c3950b857c008bc8db6723f)
+---
+ xen/arch/x86/mm/paging.c | 2 ++
+ xen/arch/x86/mm/shadow/common.c | 36 +++++++++++++++++++++++++++++++++
+ xen/arch/x86/pv/domain.c | 5 +++++
+ xen/common/tasklet.c | 5 +++++
+ xen/include/asm-x86/domain.h | 7 +++++++
+ xen/include/asm-x86/paging.h | 4 ++++
+ xen/include/asm-x86/shadow.h | 32 +++++++++++++++++++++++++++++
+ xen/include/xen/list.h | 5 +++++
+ 8 files changed, 96 insertions(+)
+
+diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
+index 2b0445ffe9..dcee496eb0 100644
+--- a/xen/arch/x86/mm/paging.c
++++ b/xen/arch/x86/mm/paging.c
+@@ -873,6 +873,8 @@ void paging_dump_domain_info(struct domain *d)
+ printk(" paging assistance: ");
+ if ( paging_mode_shadow(d) )
+ printk("shadow ");
++ if ( paging_mode_sh_forced(d) )
++ printk("forced ");
+ if ( paging_mode_hap(d) )
+ printk("hap ");
+ if ( paging_mode_refcounts(d) )
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index dd61b50eb7..fd42d734e7 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -3177,6 +3177,15 @@ static void sh_new_mode(struct domain *d, u32 new_mode)
+ ASSERT(paging_locked_by_me(d));
+ ASSERT(d != current->domain);
+
++ /*
++ * If PG_SH_forced has previously been activated because of writing an
++ * L1TF-vulnerable PTE, it must remain active for the remaining lifetime
++ * of the domain, even if the logdirty mode needs to be controlled for
++ * migration purposes.
++ */
++ if ( paging_mode_sh_forced(d) )
++ new_mode |= PG_SH_forced | PG_SH_enable;
++
+ d->arch.paging.mode = new_mode;
+ for_each_vcpu(d, v)
+ sh_update_paging_modes(v);
+@@ -4057,6 +4066,33 @@ void shadow_audit_tables(struct vcpu *v)
+
+ #endif /* Shadow audit */
+
++#ifdef CONFIG_PV
++
++void pv_l1tf_tasklet(unsigned long data)
++{
++ struct domain *d = (void *)data;
++
++ domain_pause(d);
++ paging_lock(d);
++
++ if ( !paging_mode_sh_forced(d) && !d->is_dying )
++ {
++ int ret = shadow_one_bit_enable(d, PG_SH_forced);
++
++ if ( ret )
++ {
++ printk(XENLOG_G_ERR "d%d Failed to enable PG_SH_forced: %d\n",
++ d->domain_id, ret);
++ domain_crash(d);
++ }
++ }
++
++ paging_unlock(d);
++ domain_unpause(d);
++}
++
++#endif /* CONFIG_PV */
++
+ /*
+ * Local variables:
+ * mode: C
+diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
+index a4f0bd239d..3230ac6a22 100644
+--- a/xen/arch/x86/pv/domain.c
++++ b/xen/arch/x86/pv/domain.c
+@@ -13,6 +13,7 @@
+ #include <asm/invpcid.h>
+ #include <asm/spec_ctrl.h>
+ #include <asm/pv/domain.h>
++#include <asm/shadow.h>
+
+ static __read_mostly enum {
+ PCID_OFF,
+@@ -209,6 +210,8 @@ int pv_vcpu_initialise(struct vcpu *v)
+
+ void pv_domain_destroy(struct domain *d)
+ {
++ pv_l1tf_domain_destroy(d);
++
+ destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
+ GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
+
+@@ -229,6 +232,8 @@ int pv_domain_initialise(struct domain *d)
+ };
+ int rc = -ENOMEM;
+
++ pv_l1tf_domain_init(d);
++
+ d->arch.pv_domain.gdt_ldt_l1tab =
+ alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
+ if ( !d->arch.pv_domain.gdt_ldt_l1tab )
+diff --git a/xen/common/tasklet.c b/xen/common/tasklet.c
+index 0f0a6f8365..d4fea3151c 100644
+--- a/xen/common/tasklet.c
++++ b/xen/common/tasklet.c
+@@ -156,6 +156,10 @@ void tasklet_kill(struct tasklet *t)
+
+ spin_lock_irqsave(&tasklet_lock, flags);
+
++ /* Cope with uninitialised tasklets. */
++ if ( list_head_is_null(&t->list) )
++ goto unlock;
++
+ if ( !list_empty(&t->list) )
+ {
+ BUG_ON(t->is_dead || t->is_running || (t->scheduled_on < 0));
+@@ -172,6 +176,7 @@ void tasklet_kill(struct tasklet *t)
+ spin_lock_irqsave(&tasklet_lock, flags);
+ }
+
++ unlock:
+ spin_unlock_irqrestore(&tasklet_lock, flags);
+ }
+
+diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
+index e0d413c7de..61e6900465 100644
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -121,6 +121,11 @@ struct shadow_domain {
+
+ /* Has this domain ever used HVMOP_pagetable_dying? */
+ bool_t pagetable_dying_op;
++
++#ifdef CONFIG_PV
++ /* PV L1 Terminal Fault mitigation. */
++ struct tasklet pv_l1tf_tasklet;
++#endif /* CONFIG_PV */
+ #endif
+ };
+
+@@ -257,6 +262,8 @@ struct pv_domain
+ bool xpti;
+ /* Use PCID feature? */
+ bool pcid;
++ /* Mitigate L1TF with shadow/crashing? */
++ bool check_l1tf;
+
+ /* map_domain_page() mapping cache. */
+ struct mapcache_domain mapcache;
+diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
+index f0085511c7..f440e3e53c 100644
+--- a/xen/include/asm-x86/paging.h
++++ b/xen/include/asm-x86/paging.h
+@@ -37,11 +37,14 @@
+
+ #define PG_SH_shift 20
+ #define PG_HAP_shift 21
++#define PG_SHF_shift 22
+ /* We're in one of the shadow modes */
+ #ifdef CONFIG_SHADOW_PAGING
+ #define PG_SH_enable (1U << PG_SH_shift)
++#define PG_SH_forced (1U << PG_SHF_shift)
+ #else
+ #define PG_SH_enable 0
++#define PG_SH_forced 0
+ #endif
+ #define PG_HAP_enable (1U << PG_HAP_shift)
+
+@@ -62,6 +65,7 @@
+
+ #define paging_mode_enabled(_d) (!!(_d)->arch.paging.mode)
+ #define paging_mode_shadow(_d) (!!((_d)->arch.paging.mode & PG_SH_enable))
++#define paging_mode_sh_forced(_d) (!!((_d)->arch.paging.mode & PG_SH_forced))
+ #define paging_mode_hap(_d) (!!((_d)->arch.paging.mode & PG_HAP_enable))
+
+ #define paging_mode_refcounts(_d) (!!((_d)->arch.paging.mode & PG_refcounts))
+diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
+index 94a34fd16a..14afb7db52 100644
+--- a/xen/include/asm-x86/shadow.h
++++ b/xen/include/asm-x86/shadow.h
+@@ -29,6 +29,7 @@
+ #include <asm/flushtlb.h>
+ #include <asm/paging.h>
+ #include <asm/p2m.h>
++#include <asm/spec_ctrl.h>
+
+ /*****************************************************************************
+ * Macros to tell which shadow paging mode a domain is in*/
+@@ -115,6 +116,37 @@ static inline int shadow_domctl(struct domain *d,
+
+ #endif /* CONFIG_SHADOW_PAGING */
+
++/*
++ * Mitigations for L1TF / CVE-2018-3620 for PV guests.
++ *
++ * We cannot alter an architecturally-legitimate PTE which a PV guest has
++ * chosen to write, as traditional paged-out metadata is L1TF-vulnerable.
++ * What we can do is force a PV guest which writes a vulnerable PTE into
++ * shadow mode, so Xen controls the pagetables which are reachable by the CPU
++ * pagewalk.
++ */
++
++void pv_l1tf_tasklet(unsigned long data);
++
++static inline void pv_l1tf_domain_init(struct domain *d)
++{
++ d->arch.pv_domain.check_l1tf =
++ opt_pv_l1tf & (is_hardware_domain(d)
++ ? OPT_PV_L1TF_DOM0 : OPT_PV_L1TF_DOMU);
++
++#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
++ tasklet_init(&d->arch.paging.shadow.pv_l1tf_tasklet,
++ pv_l1tf_tasklet, (unsigned long)d);
++#endif
++}
++
++static inline void pv_l1tf_domain_destroy(struct domain *d)
++{
++#if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
++ tasklet_kill(&d->arch.paging.shadow.pv_l1tf_tasklet);
++#endif
++}
++
+ /* Remove all shadows of the guest mfn. */
+ static inline void shadow_remove_all_shadows(struct domain *d, mfn_t gmfn)
+ {
+diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h
+index fa07d720ee..1387abb211 100644
+--- a/xen/include/xen/list.h
++++ b/xen/include/xen/list.h
+@@ -51,6 +51,11 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
+ list->prev = list;
+ }
+
++static inline bool list_head_is_null(const struct list_head *list)
++{
++ return !list->next && !list->prev;
++}
++
+ /*
+ * Insert a new entry between two known consecutive entries.
+ *
+--
+2.18.0
+
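Patch 0034 makes tasklet_kill() safe to call on a tasklet that tasklet_init() never touched, by recognising the all-NULL list head of a zeroed structure. A standalone sketch of the same idea, with made-up widget types rather than Xen's tasklet, looks like this:

/* Standalone sketch of making a teardown helper idempotent with respect
 * to initialisation, as tasklet_kill() is made above.  Illustrative
 * types/names; not the Xen implementation. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct list_head { struct list_head *next, *prev; };

static void init_list(struct list_head *l) { l->next = l->prev = l; }

/* A zeroed, never-initialised list head has both pointers NULL. */
static bool list_head_is_null(const struct list_head *l)
{
    return !l->next && !l->prev;
}

struct widget { struct list_head list; };

static void widget_kill(struct widget *w)
{
    if ( list_head_is_null(&w->list) )   /* never initialised: nothing to do */
        return;
    /* ... real teardown (unlinking, waiting for completion) goes here ... */
    init_list(&w->list);
}

int main(void)
{
    struct widget w;

    memset(&w, 0, sizeof(w));
    widget_kill(&w);                     /* safe even though never initialised */
    printf("kill on zeroed widget is a no-op\n");
    return 0;
}

The zeroed structure is the key: memset()-initialised state is distinguishable from a properly initialised (self-linked) list head, so teardown can bail out early.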
diff --git a/emulators/xen-kernel411/files/0035-x86-mm-Plumbing-to-allow-any-PTE-update-to-fail-with.patch b/emulators/xen-kernel411/files/0035-x86-mm-Plumbing-to-allow-any-PTE-update-to-fail-with.patch
new file mode 100644
index 000000000000..6a0845583624
--- /dev/null
+++ b/emulators/xen-kernel411/files/0035-x86-mm-Plumbing-to-allow-any-PTE-update-to-fail-with.patch
@@ -0,0 +1,255 @@
+From f4a049ede7ee9e1fafad6248cffc5e6deac1bc39 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 23 Jul 2018 08:11:40 +0200
+Subject: [PATCH 35/42] x86/mm: Plumbing to allow any PTE update to fail with
+ -ERESTART
+
+Switching to shadow mode is performed in tasklet context. To facilitate this,
+we schedule the tasklet, then create a hypercall continuation to allow the
+switch to take place.
+
+As a consequence, the x86 mm code needs to cope with an L1e operation being
+continuable. do_mmu{,ext}_op() may no longer assert that a continuation
+doesn't happen on the final iteration.
+
+To handle the arguments correctly on continuation, compat_update_va_mapping*()
+may no longer call into their non-compat counterparts. Move the compat
+functions into mm.c rather than exporting __do_update_va_mapping() and
+{get,put}_pg_owner(), and fix an unsigned long/int inconsistency with
+compat_update_va_mapping_otherdomain().
+
+This is part of XSA-273 / CVE-2018-3620.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit c612481d1c9232c6abf91b03ec655e92f808805f)
+---
+ xen/arch/x86/mm.c | 83 ++++++++++++++++++++++++++-------
+ xen/arch/x86/x86_64/compat/mm.c | 13 ------
+ xen/include/asm-x86/hypercall.h | 2 +-
+ 3 files changed, 66 insertions(+), 32 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index bcf46c0743..657af50c4c 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -613,6 +613,9 @@ static int alloc_segdesc_page(struct page_info *page)
+ return i == 512 ? 0 : -EINVAL;
+ }
+
++static int _get_page_type(struct page_info *page, unsigned long type,
++ bool preemptible);
++
+ static int get_page_and_type_from_mfn(
+ mfn_t mfn, unsigned long type, struct domain *d,
+ int partial, int preemptible)
+@@ -624,9 +627,7 @@ static int get_page_and_type_from_mfn(
+ unlikely(!get_page_from_mfn(mfn, d)) )
+ return -EINVAL;
+
+- rc = (preemptible ?
+- get_page_type_preemptible(page, type) :
+- (get_page_type(page, type) ? 0 : -EINVAL));
++ rc = _get_page_type(page, type, preemptible);
+
+ if ( unlikely(rc) && partial >= 0 &&
+ (!preemptible || page != current->arch.old_guest_table) )
+@@ -1456,8 +1457,7 @@ static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
+ return 1;
+ }
+
+-static int alloc_l2_table(struct page_info *page, unsigned long type,
+- int preemptible)
++static int alloc_l2_table(struct page_info *page, unsigned long type)
+ {
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+@@ -1469,8 +1469,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
+
+ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
+ {
+- if ( preemptible && i > page->nr_validated_ptes
+- && hypercall_preempt_check() )
++ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ rc = -ERESTART;
+@@ -1481,6 +1480,12 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
+ (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
+ continue;
+
++ if ( unlikely(rc == -ERESTART) )
++ {
++ page->nr_validated_ptes = i;
++ break;
++ }
++
+ if ( rc < 0 )
+ {
+ gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
+@@ -1763,7 +1768,7 @@ static void free_l1_table(struct page_info *page)
+ }
+
+
+-static int free_l2_table(struct page_info *page, int preemptible)
++static int free_l2_table(struct page_info *page)
+ {
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+@@ -1777,7 +1782,7 @@ static int free_l2_table(struct page_info *page, int preemptible)
+ do {
+ if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
+ put_page_from_l2e(pl2e[i], pfn) == 0 &&
+- preemptible && i && hypercall_preempt_check() )
++ i && hypercall_preempt_check() )
+ {
+ page->nr_validated_ptes = i;
+ err = -ERESTART;
+@@ -2373,7 +2378,8 @@ static int alloc_page_type(struct page_info *page, unsigned long type,
+ rc = alloc_l1_table(page);
+ break;
+ case PGT_l2_page_table:
+- rc = alloc_l2_table(page, type, preemptible);
++ ASSERT(preemptible);
++ rc = alloc_l2_table(page, type);
+ break;
+ case PGT_l3_page_table:
+ ASSERT(preemptible);
+@@ -2463,7 +2469,8 @@ int free_page_type(struct page_info *page, unsigned long type,
+ rc = 0;
+ break;
+ case PGT_l2_page_table:
+- rc = free_l2_table(page, preemptible);
++ ASSERT(preemptible);
++ rc = free_l2_table(page);
+ break;
+ case PGT_l3_page_table:
+ ASSERT(preemptible);
+@@ -3550,12 +3557,9 @@ long do_mmuext_op(
+ }
+
+ if ( rc == -ERESTART )
+- {
+- ASSERT(i < count);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi",
+ uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+- }
+ else if ( curr->arch.old_guest_table )
+ {
+ XEN_GUEST_HANDLE_PARAM(void) null;
+@@ -3861,12 +3865,9 @@ long do_mmu_update(
+ }
+
+ if ( rc == -ERESTART )
+- {
+- ASSERT(i < count);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi",
+ ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+- }
+ else if ( curr->arch.old_guest_table )
+ {
+ XEN_GUEST_HANDLE_PARAM(void) null;
+@@ -4121,7 +4122,13 @@ static int __do_update_va_mapping(
+ long do_update_va_mapping(unsigned long va, u64 val64,
+ unsigned long flags)
+ {
+- return __do_update_va_mapping(va, val64, flags, current->domain);
++ int rc = __do_update_va_mapping(va, val64, flags, current->domain);
++
++ if ( rc == -ERESTART )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_update_va_mapping, "lll", va, val64, flags);
++
++ return rc;
+ }
+
+ long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
+@@ -4138,6 +4145,46 @@ long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
+
+ put_pg_owner(pg_owner);
+
++ if ( rc == -ERESTART )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_update_va_mapping_otherdomain,
++ "llli", va, val64, flags, domid);
++
++ return rc;
++}
++
++int compat_update_va_mapping(unsigned int va, uint32_t lo, uint32_t hi,
++ unsigned int flags)
++{
++ int rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo,
++ flags, current->domain);
++
++ if ( rc == -ERESTART )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_update_va_mapping, "iiii", va, lo, hi, flags);
++
++ return rc;
++}
++
++int compat_update_va_mapping_otherdomain(unsigned int va,
++ uint32_t lo, uint32_t hi,
++ unsigned int flags, domid_t domid)
++{
++ struct domain *pg_owner;
++ int rc;
++
++ if ( (pg_owner = get_pg_owner(domid)) == NULL )
++ return -ESRCH;
++
++ rc = __do_update_va_mapping(va, ((uint64_t)hi << 32) | lo, flags, pg_owner);
++
++ put_pg_owner(pg_owner);
++
++ if ( rc == -ERESTART )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_update_va_mapping_otherdomain,
++ "iiiii", va, lo, hi, flags, domid);
++
+ return rc;
+ }
+
+diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c
+index c2aa6f2fdb..02bc75b91e 100644
+--- a/xen/arch/x86/x86_64/compat/mm.c
++++ b/xen/arch/x86/x86_64/compat/mm.c
+@@ -163,19 +163,6 @@ int compat_arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
+ return rc;
+ }
+
+-int compat_update_va_mapping(unsigned int va, u32 lo, u32 hi,
+- unsigned int flags)
+-{
+- return do_update_va_mapping(va, lo | ((u64)hi << 32), flags);
+-}
+-
+-int compat_update_va_mapping_otherdomain(unsigned long va, u32 lo, u32 hi,
+- unsigned long flags,
+- domid_t domid)
+-{
+- return do_update_va_mapping_otherdomain(va, lo | ((u64)hi << 32), flags, domid);
+-}
+-
+ DEFINE_XEN_GUEST_HANDLE(mmuext_op_compat_t);
+
+ int compat_mmuext_op(XEN_GUEST_HANDLE_PARAM(void) arg,
+diff --git a/xen/include/asm-x86/hypercall.h b/xen/include/asm-x86/hypercall.h
+index 1cc2e37d5c..da38b7991c 100644
+--- a/xen/include/asm-x86/hypercall.h
++++ b/xen/include/asm-x86/hypercall.h
+@@ -165,7 +165,7 @@ extern int compat_update_va_mapping(
+ unsigned int va, u32 lo, u32 hi, unsigned int flags);
+
+ extern int compat_update_va_mapping_otherdomain(
+- unsigned long va, u32 lo, u32 hi, unsigned long flags, domid_t domid);
++ unsigned int va, u32 lo, u32 hi, unsigned int flags, domid_t domid);
+
+ DEFINE_XEN_GUEST_HANDLE(trap_info_compat_t);
+ extern int compat_set_trap_table(XEN_GUEST_HANDLE(trap_info_compat_t) traps);
+--
+2.18.0
+
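Patch 0035 relies on the usual Xen pattern for preemptible operations: a worker records how far it got, returns -ERESTART, and the caller re-issues the operation for the remaining work instead of asserting that the final iteration completed. A self-contained sketch of that control flow follows; the real code re-enters via hypercall_create_continuation(), which is not modelled here.

/* Standalone sketch of the "preemptible operation" pattern.  The errno
 * value and the preemption rule are illustrative. */
#include <stdio.h>

#define ERESTART 85                  /* illustrative errno value */

static int validated;                /* progress kept across restarts */

static int process(int count)
{
    int start = validated;

    for ( ; validated < count; validated++ )
    {
        /* Simulated preemption check: fires every third entry, but only
         * after at least one entry has been handled in this invocation. */
        if ( validated > start && !(validated % 3) )
            return -ERESTART;
        /* ... validate entry 'validated' here ... */
    }

    return 0;
}

int main(void)
{
    int rc;

    while ( (rc = process(10)) == -ERESTART )
        printf("continuation: restarting at entry %d\n", validated);

    printf("done, rc=%d, validated=%d\n", rc, validated);
    return 0;
}

The guard "validated > start" mirrors "i > page->nr_validated_ptes" above: every invocation must make at least one step of progress, otherwise the continuation would spin forever.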
diff --git a/emulators/xen-kernel411/files/0036-x86-pv-Force-a-guest-into-shadow-mode-when-it-writes.patch b/emulators/xen-kernel411/files/0036-x86-pv-Force-a-guest-into-shadow-mode-when-it-writes.patch
new file mode 100644
index 000000000000..2508cb988df5
--- /dev/null
+++ b/emulators/xen-kernel411/files/0036-x86-pv-Force-a-guest-into-shadow-mode-when-it-writes.patch
@@ -0,0 +1,267 @@
+From 665e7685b4f5a683101ef833c45415e2548d873f Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Mon, 23 Jul 2018 08:11:40 +0200
+Subject: [PATCH 36/42] x86/pv: Force a guest into shadow mode when it writes
+ an L1TF-vulnerable PTE
+
+See the comment in shadow.h for an explanation of L1TF and the safety
+consideration of the PTEs.
+
+In the case that CONFIG_SHADOW_PAGING isn't compiled in, crash the domain
+instead. This allows well-behaved PV guests to function, while preventing
+L1TF from being exploited. (Note: PV guest kernels which haven't been updated
+with L1TF mitigations will likely be crashed as soon as they try paging a
+piece of userspace out to disk.)
+
+This is part of XSA-273 / CVE-2018-3620.
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Tim Deegan <tim@xen.org>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 06e8b622d3f3c0fa5075e91b041c6f45549ad70a)
+---
+ xen/arch/x86/mm.c | 22 ++++++--
+ xen/arch/x86/pv/ro-page-fault.c | 5 ++
+ xen/include/asm-x86/shadow.h | 94 +++++++++++++++++++++++++++++++++
+ xen/include/xen/tasklet.h | 5 ++
+ 4 files changed, 123 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 657af50c4c..7d4871b791 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -1116,7 +1116,7 @@ get_page_from_l2e(
+ int rc;
+
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+- return 1;
++ return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
+
+ if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
+ {
+@@ -1147,7 +1147,7 @@ get_page_from_l3e(
+ int rc;
+
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+- return 1;
++ return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
+
+ if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
+ {
+@@ -1180,7 +1180,7 @@ get_page_from_l4e(
+ int rc;
+
+ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+- return 1;
++ return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
+
+ if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
+ {
+@@ -1390,6 +1390,13 @@ static int alloc_l1_table(struct page_info *page)
+
+ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+ {
++ if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
++ {
++ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
++ if ( ret )
++ goto out;
++ }
++
+ switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
+ {
+ default:
+@@ -1410,6 +1417,7 @@ static int alloc_l1_table(struct page_info *page)
+
+ fail:
+ gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
++ out:
+ while ( i-- > 0 )
+ put_page_from_l1e(pl1e[i], d);
+
+@@ -2060,6 +2068,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
+ rc = -EBUSY;
+ }
+ }
++ else if ( pv_l1tf_check_l1e(pt_dom, nl1e) )
++ return -ERESTART;
+ else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
+ preserve_ad)) )
+ {
+@@ -2123,6 +2133,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
+ rc = -EBUSY;
+ }
+ }
++ else if ( pv_l1tf_check_l2e(d, nl2e) )
++ return -ERESTART;
+ else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
+ preserve_ad)) )
+ {
+@@ -2184,6 +2196,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
+ rc = -EFAULT;
+ }
+ }
++ else if ( pv_l1tf_check_l3e(d, nl3e) )
++ return -ERESTART;
+ else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
+ preserve_ad)) )
+ {
+@@ -2249,6 +2263,8 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
+ rc = -EFAULT;
+ }
+ }
++ else if ( pv_l1tf_check_l4e(d, nl4e) )
++ return -ERESTART;
+ else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
+ preserve_ad)) )
+ {
+diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
+index aa8d5a7556..a3c0c2dd19 100644
+--- a/xen/arch/x86/pv/ro-page-fault.c
++++ b/xen/arch/x86/pv/ro-page-fault.c
+@@ -29,6 +29,7 @@
+ #include <asm/mm.h>
+ #include <asm/pci.h>
+ #include <asm/pv/mm.h>
++#include <asm/shadow.h>
+
+ #include "emulate.h"
+ #include "mm.h"
+@@ -129,6 +130,10 @@ static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
+
+ /* Check the new PTE. */
+ nl1e = l1e_from_intpte(val);
++
++ if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) )
++ return X86EMUL_RETRY;
++
+ switch ( ret = get_page_from_l1e(nl1e, d, d) )
+ {
+ default:
+diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
+index 14afb7db52..f40f411871 100644
+--- a/xen/include/asm-x86/shadow.h
++++ b/xen/include/asm-x86/shadow.h
+@@ -124,8 +124,102 @@ static inline int shadow_domctl(struct domain *d,
+ * What we can do is force a PV guest which writes a vulnerable PTE into
+ * shadow mode, so Xen controls the pagetables which are reachable by the CPU
+ * pagewalk.
++ *
++ * The core of the L1TF vulnerability is that the address bits of the PTE
++ * (accounting for PSE and factoring in the level-relevant part of the linear
++ * access) are sent for an L1D lookup (to retrieve the next-level PTE, or
++ * eventual memory address) before the Present or reserved bits (which would
++ * cause a terminal fault) are accounted for. If an L1D hit occurs, the
++ * resulting data is available for potentially dependent instructions.
++ *
++ * For Present PTEs, the PV type-count safety logic ensures that the address
++ * bits always point at a guest-accessible frame, which is safe WRT L1TF from
++ * Xen's point of view. In practice, a PV guest should be unable to set any
++ * reserved bits, so should be unable to create any present L1TF-vulnerable
++ * PTEs at all.
++ *
++ * Therefore, these safety checks apply to Not-Present PTEs only, where
++ * traditionally, Xen would have let the guest write any value it chose.
++ *
++ * The all-zero PTE potentially leaks mfn 0. All software on the system is
++ * expected to cooperate and not put any secrets there. In a Xen system,
++ * neither Xen nor dom0 are expected to touch mfn 0, as it typically contains
++ * the real mode IVT and Bios Data Area. Therefore, mfn 0 is considered safe.
++ *
++ * Any PTE whose address is higher than the maximum cacheable address is safe,
++ * as it won't get an L1D hit.
++ *
++ * Speculative superpages also need accounting for, as PSE is considered
++ * irrespective of Present. We disallow PSE being set, as it allows an
++ * attacker to leak 2M or 1G of data starting from mfn 0. Also, because of
++ * recursive/linear pagetables, we must consider PSE even at L4, as hardware
++ * will interpret an L4e as an L3e during a recursive walk.
+ */
+
++static inline bool is_l1tf_safe_maddr(intpte_t pte)
++{
++ paddr_t maddr = pte & l1tf_addr_mask;
++
++ return maddr == 0 || maddr >= l1tf_safe_maddr;
++}
++
++static inline bool pv_l1tf_check_pte(struct domain *d, unsigned int level,
++ intpte_t pte)
++{
++ ASSERT(is_pv_domain(d));
++ ASSERT(!(pte & _PAGE_PRESENT));
++
++ if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) &&
++ (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) )
++ {
++#ifdef CONFIG_SHADOW_PAGING
++ struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet;
++
++ printk(XENLOG_G_WARNING
++ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n",
++ d->domain_id, level, pte);
++ /*
++ * Safety consideration for accessing tasklet.scheduled_on without the
++ * tasklet lock. This is a singleshot tasklet with the side effect of
++ * setting PG_SH_forced (checked just above). Multiple vcpus can race
++ * to schedule the tasklet, but if we observe it scheduled anywhere,
++ * that is good enough.
++ */
++ smp_rmb();
++ if ( !tasklet_is_scheduled(t) )
++ tasklet_schedule(t);
++#else
++ printk(XENLOG_G_ERR
++ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n",
++ d->domain_id, level, pte);
++ domain_crash(d);
++#endif
++ return true;
++ }
++
++ return false;
++}
++
++static inline bool pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e)
++{
++ return pv_l1tf_check_pte(d, 1, l1e.l1);
++}
++
++static inline bool pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e)
++{
++ return pv_l1tf_check_pte(d, 2, l2e.l2);
++}
++
++static inline bool pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e)
++{
++ return pv_l1tf_check_pte(d, 3, l3e.l3);
++}
++
++static inline bool pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e)
++{
++ return pv_l1tf_check_pte(d, 4, l4e.l4);
++}
++
+ void pv_l1tf_tasklet(unsigned long data);
+
+ static inline void pv_l1tf_domain_init(struct domain *d)
+diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h
+index 23d69c738e..bc9ddace6d 100644
+--- a/xen/include/xen/tasklet.h
++++ b/xen/include/xen/tasklet.h
+@@ -50,6 +50,11 @@ static inline bool tasklet_work_to_do(unsigned int cpu)
+ TASKLET_scheduled);
+ }
+
++static inline bool tasklet_is_scheduled(const struct tasklet *t)
++{
++ return t->scheduled_on != -1;
++}
++
+ void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu);
+ void tasklet_schedule(struct tasklet *t);
+ void do_tasklet(void);
+--
+2.18.0
+
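The shadow.h comment added by patch 0036 reduces to a short predicate over the PTE's address bits and the PSE flag. A standalone sketch of that predicate follows, with made-up mask and threshold constants standing in for l1tf_addr_mask and l1tf_safe_maddr.

/* Standalone sketch of the not-present PTE safety test described above:
 * the address bits must point either at mfn 0 or above the highest
 * cacheable address, and PSE must not be set.  The mask/threshold values
 * below are invented for illustration. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_PSE   (1ULL << 7)
#define ADDR_MASK  0x000ffffffffff000ULL   /* illustrative L1D address mask */
#define SAFE_MADDR (1ULL << 46)            /* illustrative "no cacheable RAM above" */

static bool is_l1tf_safe_maddr(uint64_t pte)
{
    uint64_t maddr = pte & ADDR_MASK;

    return maddr == 0 || maddr >= SAFE_MADDR;
}

static bool pte_is_vulnerable(uint64_t pte, unsigned int level)
{
    /* Only not-present PTEs are interesting; present ones are refcounted. */
    return (level > 1 && (pte & PAGE_PSE)) || !is_l1tf_safe_maddr(pte);
}

int main(void)
{
    printf("zero pte:      %d\n", pte_is_vulnerable(0, 1));
    printf("swap metadata: %d\n", pte_is_vulnerable(0x12345000, 1));
    printf("high address:  %d\n", pte_is_vulnerable(1ULL << 47, 1));
    return 0;
}

A guest writing ordinary swap metadata into a not-present PTE (the second case) is exactly what trips the check and forces the domain into shadow mode.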
diff --git a/emulators/xen-kernel411/files/0037-x86-spec-ctrl-CPUID-MSR-definitions-for-L1D_FLUSH.patch b/emulators/xen-kernel411/files/0037-x86-spec-ctrl-CPUID-MSR-definitions-for-L1D_FLUSH.patch
new file mode 100644
index 000000000000..26232a95d89b
--- /dev/null
+++ b/emulators/xen-kernel411/files/0037-x86-spec-ctrl-CPUID-MSR-definitions-for-L1D_FLUSH.patch
@@ -0,0 +1,134 @@
+From fb78137bb82d3d8bcac36430b8bc331008ee3826 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Wed, 28 Mar 2018 15:21:39 +0100
+Subject: [PATCH 37/42] x86/spec-ctrl: CPUID/MSR definitions for L1D_FLUSH
+
+This is part of XSA-273 / CVE-2018-3646.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 3563fc2b2731a63fd7e8372ab0f5cef205bf8477)
+---
+ docs/misc/xen-command-line.markdown | 8 ++++----
+ tools/libxl/libxl_cpuid.c | 1 +
+ tools/misc/xen-cpuid.c | 2 +-
+ xen/arch/x86/cpuid.c | 5 +++++
+ xen/arch/x86/spec_ctrl.c | 4 +++-
+ xen/include/asm-x86/msr-index.h | 4 ++++
+ xen/include/public/arch-x86/cpufeatureset.h | 1 +
+ 7 files changed, 19 insertions(+), 6 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 763cc1d878..158b5bb919 100644
+--- a/docs/misc/xen-command-line.markdown
++++ b/docs/misc/xen-command-line.markdown
+@@ -489,10 +489,10 @@ accounting for hardware capabilities as enumerated via CPUID.
+
+ Currently accepted:
+
+-The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`, `ssbd` are
+-used by default if available and applicable. They can be ignored,
+-e.g. `no-ibrsb`, at which point Xen won't use them itself, and won't offer
+-them to guests.
++The Speculation Control hardware features `ibrsb`, `stibp`, `ibpb`,
++`l1d-flush` and `ssbd` are used by default if available and applicable. They can
++be ignored, e.g. `no-ibrsb`, at which point Xen won't use them itself, and
++won't offer them to guests.
+
+ ### cpuid\_mask\_cpu (AMD only)
+ > `= fam_0f_rev_c | fam_0f_rev_d | fam_0f_rev_e | fam_0f_rev_f | fam_0f_rev_g | fam_10_rev_b | fam_10_rev_c | fam_11_rev_b`
+diff --git a/tools/libxl/libxl_cpuid.c b/tools/libxl/libxl_cpuid.c
+index 7b0f594c3d..52e16c20ed 100644
+--- a/tools/libxl/libxl_cpuid.c
++++ b/tools/libxl/libxl_cpuid.c
+@@ -204,6 +204,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str)
+ {"avx512-4fmaps",0x00000007, 0, CPUID_REG_EDX, 3, 1},
+ {"ibrsb", 0x00000007, 0, CPUID_REG_EDX, 26, 1},
+ {"stibp", 0x00000007, 0, CPUID_REG_EDX, 27, 1},
++ {"l1d-flush", 0x00000007, 0, CPUID_REG_EDX, 28, 1},
+ {"arch-caps", 0x00000007, 0, CPUID_REG_EDX, 29, 1},
+ {"ssbd", 0x00000007, 0, CPUID_REG_EDX, 31, 1},
+
+diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c
+index e116339733..3888b4e158 100644
+--- a/tools/misc/xen-cpuid.c
++++ b/tools/misc/xen-cpuid.c
+@@ -143,7 +143,7 @@ static const char *str_7d0[32] =
+ [ 2] = "avx512_4vnniw", [ 3] = "avx512_4fmaps",
+
+ [26] = "ibrsb", [27] = "stibp",
+- /* 28 */ [29] = "arch_caps",
++ [28] = "l1d_flush", [29] = "arch_caps",
+ /* 30 */ [31] = "ssbd",
+ };
+
+diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
+index beee47d0ed..5cc89e2b34 100644
+--- a/xen/arch/x86/cpuid.c
++++ b/xen/arch/x86/cpuid.c
+@@ -43,6 +43,11 @@ static int __init parse_xen_cpuid(const char *s)
+ if ( !val )
+ setup_clear_cpu_cap(X86_FEATURE_STIBP);
+ }
++ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
++ {
++ if ( !val )
++ setup_clear_cpu_cap(X86_FEATURE_L1D_FLUSH);
++ }
+ else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 )
+ {
+ if ( !val )
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 7995e27218..9bcc2b6adc 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -266,14 +266,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ printk("Speculative mitigation facilities:\n");
+
+ /* Hardware features which pertain to speculative mitigations. */
+- printk(" Hardware features:%s%s%s%s%s%s%s%s\n",
++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n",
+ (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
++ (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_SSBD)) ? " SSBD" : "",
+ (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "",
+ (caps & ARCH_CAPABILITIES_IBRS_ALL) ? " IBRS_ALL" : "",
+ (caps & ARCH_CAPABILITIES_RDCL_NO) ? " RDCL_NO" : "",
+ (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
++ (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "",
+ (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "");
+
+ /* Compiled-in support which pertains to mitigations. */
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index 8fbccc88a7..7235623c86 100644
+--- a/xen/include/asm-x86/msr-index.h
++++ b/xen/include/asm-x86/msr-index.h
+@@ -47,8 +47,12 @@
+ #define ARCH_CAPABILITIES_RDCL_NO (_AC(1, ULL) << 0)
+ #define ARCH_CAPABILITIES_IBRS_ALL (_AC(1, ULL) << 1)
+ #define ARCH_CAPS_RSBA (_AC(1, ULL) << 2)
++#define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3)
+ #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
+
++#define MSR_FLUSH_CMD 0x0000010b
++#define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
++
+ /* Intel MSRs. Some also available on other CPUs */
+ #define MSR_IA32_PERFCTR0 0x000000c1
+ #define MSR_IA32_A_PERFCTR0 0x000004c1
+diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
+index f1a5ed93e0..9f4c8246a9 100644
+--- a/xen/include/public/arch-x86/cpufeatureset.h
++++ b/xen/include/public/arch-x86/cpufeatureset.h
+@@ -244,6 +244,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions *
+ XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single Precision */
+ XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */
+ XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */
++XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /* MSR_FLUSH_CMD and L1D flush. */
+ XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */
+ XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */
+
+--
+2.18.0
+
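Patch 0037 only adds enumeration: CPUID leaf 7, sub-leaf 0, EDX bit 28 advertises L1D_FLUSH (and bit 29 ARCH_CAPS). A small host-side probe using the GCC/Clang <cpuid.h> helper could look like the sketch below; it merely reads CPUID as exposed to the program and does not touch MSR_FLUSH_CMD.

/* Standalone x86 probe for the L1D_FLUSH enumeration (CPUID.7.0:EDX). */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if ( !__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) )
    {
        puts("CPUID leaf 7 not available");
        return 1;
    }

    printf("L1D_FLUSH: %s\n", (edx & (1u << 28)) ? "present" : "absent");
    printf("ARCH_CAPS: %s\n", (edx & (1u << 29)) ? "present" : "absent");
    return 0;
}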
diff --git a/emulators/xen-kernel411/files/0038-x86-msr-Virtualise-MSR_FLUSH_CMD-for-guests.patch b/emulators/xen-kernel411/files/0038-x86-msr-Virtualise-MSR_FLUSH_CMD-for-guests.patch
new file mode 100644
index 000000000000..96f0e240eb33
--- /dev/null
+++ b/emulators/xen-kernel411/files/0038-x86-msr-Virtualise-MSR_FLUSH_CMD-for-guests.patch
@@ -0,0 +1,103 @@
+From 007752fb9b85b9235fe2820677988c6408c583da Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Fri, 13 Apr 2018 15:34:01 +0000
+Subject: [PATCH 38/42] x86/msr: Virtualise MSR_FLUSH_CMD for guests
+
+Guests (outside of the nested virt case, which isn't supported yet) don't need
+L1D_FLUSH for their L1TF mitigations, but offering/emulating MSR_FLUSH_CMD is
+easy and doesn't pose an issue for Xen.
+
+The MSR is offered to HVM guests only. PV guests attempting to use it would
+trap for emulation, and the L1D cache would fill long before the return to
+guest context. As such, PV guests can't make any use of the L1D_FLUSH
+functionality.
+
+This is part of XSA-273 / CVE-2018-3646.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit fd9823faf9df057a69a9a53c2e100691d3f4267c)
+---
+ xen/arch/x86/domctl.c | 3 ++-
+ xen/arch/x86/hvm/vmx/vmx.c | 6 ++++++
+ xen/arch/x86/msr.c | 12 ++++++++++++
+ xen/include/public/arch-x86/cpufeatureset.h | 2 +-
+ 4 files changed, 21 insertions(+), 2 deletions(-)
+
+diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
+index fa82b6744e..dd91038a67 100644
+--- a/xen/arch/x86/domctl.c
++++ b/xen/arch/x86/domctl.c
+@@ -225,7 +225,8 @@ static int update_domain_cpuid_info(struct domain *d,
+ */
+ call_policy_changed = (is_hvm_domain(d) &&
+ ((old_7d0 ^ p->feat.raw[0].d) &
+- cpufeat_mask(X86_FEATURE_IBRSB)));
++ (cpufeat_mask(X86_FEATURE_IBRSB) |
++ cpufeat_mask(X86_FEATURE_L1D_FLUSH))));
+ break;
+
+ case 0xa:
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index c7cf3a8fbc..b0fababede 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -583,6 +583,12 @@ static void vmx_cpuid_policy_changed(struct vcpu *v)
+ vmx_clear_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW);
+ else
+ vmx_set_msr_intercept(v, MSR_PRED_CMD, VMX_MSR_RW);
++
++ /* MSR_FLUSH_CMD is safe to pass through if the guest knows about it. */
++ if ( cp->feat.l1d_flush )
++ vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
++ else
++ vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
+ }
+
+ int vmx_guest_x86_mode(struct vcpu *v)
+diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
+index 1e12ccb729..1a591dd2b5 100644
+--- a/xen/arch/x86/msr.c
++++ b/xen/arch/x86/msr.c
+@@ -150,6 +150,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+ case MSR_AMD_PATCHLOADER:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_PRED_CMD:
++ case MSR_FLUSH_CMD:
+ /* Write-only */
+ goto gp_fault;
+
+@@ -254,6 +255,17 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
+ wrmsrl(MSR_PRED_CMD, val);
+ break;
+
++ case MSR_FLUSH_CMD:
++ if ( !cp->feat.l1d_flush )
++ goto gp_fault; /* MSR available? */
++
++ if ( val & ~FLUSH_CMD_L1D )
++ goto gp_fault; /* Rsvd bit set? */
++
++ if ( v == curr )
++ wrmsrl(MSR_FLUSH_CMD, val);
++ break;
++
+ case MSR_INTEL_MISC_FEATURES_ENABLES:
+ {
+ bool old_cpuid_faulting = vp->misc_features_enables.cpuid_faulting;
+diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
+index 9f4c8246a9..6c82816fd3 100644
+--- a/xen/include/public/arch-x86/cpufeatureset.h
++++ b/xen/include/public/arch-x86/cpufeatureset.h
+@@ -244,7 +244,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions *
+ XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single Precision */
+ XEN_CPUFEATURE(IBRSB, 9*32+26) /*A IBRS and IBPB support (used by Intel) */
+ XEN_CPUFEATURE(STIBP, 9*32+27) /*A STIBP */
+-XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /* MSR_FLUSH_CMD and L1D flush. */
++XEN_CPUFEATURE(L1D_FLUSH, 9*32+28) /*S MSR_FLUSH_CMD and L1D flush. */
+ XEN_CPUFEATURE(ARCH_CAPS, 9*32+29) /* IA32_ARCH_CAPABILITIES MSR */
+ XEN_CPUFEATURE(SSBD, 9*32+31) /*A MSR_SPEC_CTRL.SSBD available */
+
+--
+2.18.0
+
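The guest_wrmsr() hunk in patch 0038 boils down to two checks before the write is forwarded: the MSR must be advertised in the guest's CPUID policy, and only bit 0 (FLUSH_CMD_L1D) may be set. A standalone sketch of just that validation follows, with a stub policy structure instead of Xen's struct cpuid_policy internals.

/* Standalone sketch of the MSR_FLUSH_CMD write validation.  Returning -1
 * stands in for the #GP fault Xen injects; types are illustrative. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FLUSH_CMD_L1D (1ULL << 0)

struct cpuid_policy { bool l1d_flush; };

static int flush_cmd_wrmsr(const struct cpuid_policy *cp, uint64_t val)
{
    if ( !cp->l1d_flush )
        return -1;              /* MSR not advertised to this guest */

    if ( val & ~FLUSH_CMD_L1D )
        return -1;              /* reserved bit set */

    /* wrmsrl(MSR_FLUSH_CMD, val) would happen here for the current vcpu. */
    return 0;
}

int main(void)
{
    struct cpuid_policy cp = { .l1d_flush = true };

    printf("flush:    %d\n", flush_cmd_wrmsr(&cp, FLUSH_CMD_L1D));
    printf("reserved: %d\n", flush_cmd_wrmsr(&cp, 0x2));
    return 0;
}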
diff --git a/emulators/xen-kernel411/files/0039-x86-spec-ctrl-Introduce-an-option-to-control-L1D_FLU.patch b/emulators/xen-kernel411/files/0039-x86-spec-ctrl-Introduce-an-option-to-control-L1D_FLU.patch
new file mode 100644
index 000000000000..e64e649d2759
--- /dev/null
+++ b/emulators/xen-kernel411/files/0039-x86-spec-ctrl-Introduce-an-option-to-control-L1D_FLU.patch
@@ -0,0 +1,188 @@
+From 2a47c7550910f5d591ca0de369234f8c18daa2d2 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Tue, 29 May 2018 18:44:16 +0100
+Subject: [PATCH 39/42] x86/spec-ctrl: Introduce an option to control L1D_FLUSH
+ for HVM HAP guests
+
+This mitigation requires up-to-date microcode. It is enabled by default on
+affected hardware if available, and is used for HVM guests.
+
+The default for SMT/Hyperthreading is far more complicated to reason about,
+not least because we don't know if the user is going to want to run any HVM
+guests to begin with. If an explicit default isn't given, nag the user to
+perform a risk assessment and choose an explicit default, and leave other
+configuration to the toolstack.
+
+This is part of XSA-273 / CVE-2018-3620.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 3bd36952dab60290f33d6791070b57920e10754b)
+---
+ docs/misc/xen-command-line.markdown | 9 ++++++-
+ xen/arch/x86/hvm/vmx/vmcs.c | 5 ++++
+ xen/arch/x86/spec_ctrl.c | 38 +++++++++++++++++++++++++++--
+ xen/include/asm-x86/spec_ctrl.h | 1 +
+ 4 files changed, 50 insertions(+), 3 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 158b5bb919..57ef18194a 100644
+--- a/docs/misc/xen-command-line.markdown
++++ b/docs/misc/xen-command-line.markdown
+@@ -1791,7 +1791,8 @@ false disable the quirk workaround, which is also the default.
+
+ ### spec-ctrl (x86)
+ > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb}=<bool>,
+-> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu}=<bool> ]`
++> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
++> l1d-flush}=<bool> ]`
+
+ Controls for speculative execution sidechannel mitigations. By default, Xen
+ will pick the most appropriate mitigations based on compiled in support,
+@@ -1846,6 +1847,12 @@ from using fully eager FPU context switches. This is currently implemented as
+ a global control. By default, Xen will choose to use fully eager context
+ switches on hardware believed to speculate past #NM exceptions.
+
++On hardware supporting L1D_FLUSH, the `l1d-flush=` option can be used to force
++or prevent Xen from issuing an L1 data cache flush on each VMEntry.
++Irrespective of Xen's setting, the feature is virtualised for HVM guests to
++use. By default, Xen will enable this mitigation on hardware believed to be
++vulnerable to L1TF.
++
+ ### sync\_console
+ > `= <boolean>`
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 30a33dd0bd..2ba0c40808 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -38,6 +38,7 @@
+ #include <asm/flushtlb.h>
+ #include <asm/monitor.h>
+ #include <asm/shadow.h>
++#include <asm/spec_ctrl.h>
+ #include <asm/tboot.h>
+ #include <asm/apic.h>
+
+@@ -1274,6 +1275,10 @@ static int construct_vmcs(struct vcpu *v)
+
+ vmx_vlapic_msr_changed(v);
+
++ if ( opt_l1d_flush && paging_mode_hap(d) )
++ rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
++ VMX_MSR_GUEST_LOADONLY);
++
+ out:
+ vmx_vmcs_exit(v);
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 9bcc2b6adc..59baebb959 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -19,11 +19,13 @@
+ #include <xen/errno.h>
+ #include <xen/init.h>
+ #include <xen/lib.h>
++#include <xen/warning.h>
+
+ #include <asm/microcode.h>
+ #include <asm/msr.h>
+ #include <asm/processor.h>
+ #include <asm/pv/shim.h>
++#include <asm/setup.h>
+ #include <asm/spec_ctrl.h>
+ #include <asm/spec_ctrl_asm.h>
+
+@@ -46,6 +48,7 @@ static int8_t __initdata opt_ibrs = -1;
+ bool __read_mostly opt_ibpb = true;
+ bool __read_mostly opt_ssbd = false;
+ int8_t __read_mostly opt_eager_fpu = -1;
++int8_t __read_mostly opt_l1d_flush = -1;
+
+ bool __initdata bsp_delay_spec_ctrl;
+ uint8_t __read_mostly default_xen_spec_ctrl;
+@@ -139,6 +142,7 @@ static int __init parse_spec_ctrl(const char *s)
+ opt_ibrs = 0;
+ opt_ibpb = false;
+ opt_ssbd = false;
++ opt_l1d_flush = 0;
+ }
+ else if ( val > 0 )
+ rc = -EINVAL;
+@@ -194,6 +198,8 @@ static int __init parse_spec_ctrl(const char *s)
+ opt_ssbd = val;
+ else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
+ opt_eager_fpu = val;
++ else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
++ opt_l1d_flush = val;
+ else
+ rc = -EINVAL;
+
+@@ -290,7 +296,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ "\n");
+
+ /* Settings for Xen's protection, irrespective of guests. */
+- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s\n",
++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s\n",
+ thunk == THUNK_NONE ? "N/A" :
+ thunk == THUNK_RETPOLINE ? "RETPOLINE" :
+ thunk == THUNK_LFENCE ? "LFENCE" :
+@@ -299,7 +305,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-",
+ !boot_cpu_has(X86_FEATURE_SSBD) ? "" :
+ (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
+- opt_ibpb ? " IBPB" : "");
++ opt_ibpb ? " IBPB" : "",
++ opt_l1d_flush ? " L1D_FLUSH" : "");
+
+ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
+ if ( cpu_has_bug_l1tf || opt_pv_l1tf )
+@@ -871,6 +878,33 @@ void __init init_speculation_mitigations(void)
+ opt_pv_l1tf = OPT_PV_L1TF_DOMU;
+ }
+
++ /*
++ * By default, enable L1D_FLUSH on L1TF-vulnerable hardware, unless
++ * instructed to skip the flush on vmentry by our outer hypervisor.
++ */
++ if ( !boot_cpu_has(X86_FEATURE_L1D_FLUSH) )
++ opt_l1d_flush = 0;
++ else if ( opt_l1d_flush == -1 )
++ opt_l1d_flush = cpu_has_bug_l1tf && !(caps & ARCH_CAPS_SKIP_L1DFL);
++
++ /*
++ * We do not disable HT by default on affected hardware.
++ *
++ * Firstly, if the user intends to use exclusively PV, or HVM shadow
++ * guests, HT isn't a concern and should remain fully enabled. Secondly,
++ * safety for HVM HAP guests can be arranged by the toolstack with core
++ * parking, pinning or cpupool configurations, including mixed setups.
++ *
++ * However, if we are on affected hardware, with HT enabled, and the user
++ * hasn't explicitly chosen whether to use HT or not, nag them to do so.
++ */
++ if ( opt_smt == -1 && cpu_has_bug_l1tf && !pv_shim &&
++ boot_cpu_data.x86_num_siblings > 1 )
++ warning_add(
++ "Booted on L1TF-vulnerable hardware with SMT/Hyperthreading\n"
++ "enabled. Please assess your configuration and choose an\n"
++ "explicit 'smt=<bool>' setting. See XSA-273.\n");
++
+ print_details(thunk, caps);
+
+ /*
+diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
+index cdf5737dc2..8f8aad40bb 100644
+--- a/xen/include/asm-x86/spec_ctrl.h
++++ b/xen/include/asm-x86/spec_ctrl.h
+@@ -29,6 +29,7 @@ void init_speculation_mitigations(void);
+ extern bool opt_ibpb;
+ extern bool opt_ssbd;
+ extern int8_t opt_eager_fpu;
++extern int8_t opt_l1d_flush;
+
+ extern bool bsp_delay_spec_ctrl;
+ extern uint8_t default_xen_spec_ctrl;
+--
+2.18.0
+
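Patch 0039 resolves the l1d-flush tri-state late, once the feature flag, the L1TF vulnerability status and the ARCH_CAPS_SKIP_L1DFL hint from an outer hypervisor are all known. A minimal sketch of that resolution logic, under illustrative names, is:

/* Standalone sketch of resolving the l1d-flush default.  Illustrative
 * names; not the Xen code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int8_t opt_l1d_flush = -1;    /* -1 = no command line choice */

static void resolve_l1d_flush(bool has_l1d_flush, bool bug_l1tf,
                              bool skip_l1dfl)
{
    if ( !has_l1d_flush )
        opt_l1d_flush = 0;                       /* no microcode support */
    else if ( opt_l1d_flush == -1 )
        opt_l1d_flush = bug_l1tf && !skip_l1dfl; /* default on if vulnerable */
}

int main(void)
{
    resolve_l1d_flush(true, true, false);
    printf("opt_l1d_flush = %d\n", opt_l1d_flush);   /* 1 on this input */
    return 0;
}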
diff --git a/emulators/xen-kernel411/files/0040-x86-Make-spec-ctrl-no-a-global-disable-of-all-mitiga.patch b/emulators/xen-kernel411/files/0040-x86-Make-spec-ctrl-no-a-global-disable-of-all-mitiga.patch
new file mode 100644
index 000000000000..57320614a0b7
--- /dev/null
+++ b/emulators/xen-kernel411/files/0040-x86-Make-spec-ctrl-no-a-global-disable-of-all-mitiga.patch
@@ -0,0 +1,69 @@
+From 6c7d074a4b5c8e69e21e505a04e7bb3f43658bea Mon Sep 17 00:00:00 2001
+From: Jan Beulich <JBeulich@suse.com>
+Date: Mon, 13 Aug 2018 05:07:23 -0600
+Subject: [PATCH 40/42] x86: Make "spec-ctrl=no" a global disable of all
+ mitigations
+
+In order to have a simple and easy to remember means to suppress all the
+more or less recent workarounds for hardware vulnerabilities, force
+settings not controlled by "spec-ctrl=" also to their original defaults,
+unless they've been forced to specific values already by earlier command
+line options.
+
+This is part of XSA-273.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+(cherry picked from commit d8800a82c3840b06b17672eddee4878bbfdacc6d)
+---
+ docs/misc/xen-command-line.markdown | 13 +++++++++----
+ xen/arch/x86/spec_ctrl.c | 9 +++++++++
+ 2 files changed, 18 insertions(+), 4 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 57ef18194a..0886706368 100644
+--- a/docs/misc/xen-command-line.markdown
++++ b/docs/misc/xen-command-line.markdown
+@@ -1804,10 +1804,15 @@ extreme care.**
+
+ An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
+ mitigations, including pieces of infrastructure used to virtualise certain
+-mitigation features for guests. Alternatively, a slightly more restricted
+-`spec-ctrl=no-xen` can be used to turn off all of Xen's mitigations, while
+-leaving the virtualisation support in place for guests to use. Use of a
+-positive boolean value for either of these options is invalid.
++mitigation features for guests. This also includes settings which `xpti`,
++`smt`, `pv-l1tf` control, unless the respective option(s) have been
++specified earlier on the command line.
++
++Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
++turn off all of Xen's mitigations, while leaving the virtualisation support
++in place for guests to use.
++
++Use of a positive boolean value for either of these options is invalid.
+
+ The booleans `pv=`, `hvm=`, `msr-sc=` and `rsb=` offer fine grained control
+ over the alternative blocks used by Xen. These impact Xen's ability to
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 59baebb959..f0c50d6703 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -134,6 +134,15 @@ static int __init parse_spec_ctrl(const char *s)
+
+ opt_eager_fpu = 0;
+
++ if ( opt_xpti < 0 )
++ opt_xpti = 0;
++
++ if ( opt_smt < 0 )
++ opt_smt = 1;
++
++ if ( opt_pv_l1tf < 0 )
++ opt_pv_l1tf = 0;
++
+ disable_common:
+ opt_rsb_pv = false;
+ opt_rsb_hvm = false;
+--
+2.18.0
+
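Patch 0040 hinges on the same -1 / "not yet chosen" convention: spec-ctrl=no only forces xpti, smt and pv-l1tf to their off defaults if no earlier command-line option has already assigned them. A tiny sketch of that precedence rule, with invented option names, is:

/* Standalone sketch of "global off switch respects earlier explicit
 * settings": only options still at -1 (unset) are forced. */
#include <stdint.h>
#include <stdio.h>

static int8_t opt_alpha = -1, opt_beta = -1;

static void parse_global_off(void)
{
    if ( opt_alpha < 0 )
        opt_alpha = 0;
    if ( opt_beta < 0 )
        opt_beta = 0;
}

int main(void)
{
    opt_beta = 1;            /* e.g. "beta=1" appeared earlier on the line */
    parse_global_off();      /* e.g. a later global "no" */

    printf("alpha=%d beta=%d\n", opt_alpha, opt_beta);   /* alpha=0 beta=1 */
    return 0;
}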
diff --git a/emulators/xen-kernel411/files/0042-x86-write-to-correct-variable-in-parse_pv_l1tf.patch b/emulators/xen-kernel411/files/0042-x86-write-to-correct-variable-in-parse_pv_l1tf.patch
new file mode 100644
index 000000000000..4515a184d416
--- /dev/null
+++ b/emulators/xen-kernel411/files/0042-x86-write-to-correct-variable-in-parse_pv_l1tf.patch
@@ -0,0 +1,31 @@
+From 733450b39b83d7891ddd931399beef93e1edbf33 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 15 Aug 2018 14:20:24 +0200
+Subject: [PATCH 42/42] x86: write to correct variable in parse_pv_l1tf()
+
+Apparently a copy-and-paste mistake.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 57c554f8a6e06894f601d977d18b3017d2a60f40
+master date: 2018-08-15 14:15:30 +0200
+---
+ xen/arch/x86/spec_ctrl.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index f0c50d6703..c430b25b84 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -232,7 +232,7 @@ static __init int parse_pv_l1tf(const char *s)
+
+ /* Interpret 'pv-l1tf' alone in its positive boolean form. */
+ if ( *s == '\0' )
+- opt_xpti = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
++ opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
+
+ do {
+ ss = strchr(s, ',');
+--
+2.18.0
+