diff options
author | royger <royger@FreeBSD.org> | 2018-01-24 00:23:57 +0800 |
---|---|---|
committer | royger <royger@FreeBSD.org> | 2018-01-24 00:23:57 +0800 |
commit | eb65a9021abbcb4b1ee08b2926724a3caa227a59 (patch) | |
tree | f089cc1da5298a01dc1290f561b5d38a7cba6b97 | |
parent | 33ece822bf3b7326a7f0f3adafe553e523447a27 (diff) | |
download | freebsd-ports-gnome-eb65a9021abbcb4b1ee08b2926724a3caa227a59.tar.gz freebsd-ports-gnome-eb65a9021abbcb4b1ee08b2926724a3caa227a59.tar.zst freebsd-ports-gnome-eb65a9021abbcb4b1ee08b2926724a3caa227a59.zip |
xen-kernel: fix build with clang 6 and apply pending XSA patches
This includes a band-aid for running 64-bit PV guests without
compromising the whole system.
MFH: 2018Q1
Sponsored by: Citrix Systems R&D
10 files changed, 1646 insertions, 0 deletions
diff --git a/emulators/xen-kernel/files/0001-p2m-Always-check-to-see-if-removing-a-p2m-entry-actu.patch b/emulators/xen-kernel/files/0001-p2m-Always-check-to-see-if-removing-a-p2m-entry-actu.patch new file mode 100644 index 000000000000..bba280c92641 --- /dev/null +++ b/emulators/xen-kernel/files/0001-p2m-Always-check-to-see-if-removing-a-p2m-entry-actu.patch @@ -0,0 +1,176 @@ +From f345ca185e0c042ed12bf929a9e93efaf33397bb Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Fri, 10 Nov 2017 16:53:54 +0000 +Subject: [PATCH 1/2] p2m: Always check to see if removing a p2m entry actually + worked + +The PoD zero-check functions speculatively remove memory from the p2m, +then check to see if it's completely zeroed, before putting it in the +cache. + +Unfortunately, the p2m_set_entry() calls may fail if the underlying +pagetable structure needs to change and the domain has exhausted its +p2m memory pool: for instance, if we're removing a 2MiB region out of +a 1GiB entry (in the p2m_pod_zero_check_superpage() case), or a 4k +region out of a 2MiB or larger entry (in the p2m_pod_zero_check() +case); and the return value is not checked. + +The underlying mfn will then be added into the PoD cache, and at some +point mapped into another location in the p2m. If the guest +afterwards ballons out this memory, it will be freed to the hypervisor +and potentially reused by another domain, in spite of the fact that +the original domain still has writable mappings to it. + +There are several places where p2m_set_entry() shouldn't be able to +fail, as it is guaranteed to write an entry of the same order that +succeeded before. Add a backstop of crashing the domain just in case, +and an ASSERT_UNREACHABLE() to flag up the broken assumption on debug +builds. + +While we're here, use PAGE_ORDER_2M rather than a magic constant. + +This is part of XSA-247. 
+ +Reported-by: George Dunlap <george.dunlap.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- +v4: +- Removed some training whitespace +v3: +- Reformat reset clause to be more compact +- Make sure to set map[i] = NULL when unmapping in case we need to bail +v2: +- Crash a domain if a p2m_set_entry we think cannot fail fails anyway. +--- + xen/arch/x86/mm/p2m-pod.c | 77 +++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 61 insertions(+), 16 deletions(-) + +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index 87082cf65f..5ec8a37949 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -754,8 +754,10 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn) + } + + /* Try to remove the page, restoring old mapping if it fails. */ +- p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_2M, +- p2m_populate_on_demand, p2m->default_access); ++ if ( p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_2M, ++ p2m_populate_on_demand, p2m->default_access) ) ++ goto out; ++ + p2m_tlb_flush_sync(p2m); + + /* Make none of the MFNs are used elsewhere... for example, mapped +@@ -812,9 +814,18 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn) + ret = SUPERPAGE_PAGES; + + out_reset: +- if ( reset ) +- p2m_set_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access); +- ++ /* ++ * This p2m_set_entry() call shouldn't be able to fail, since the same order ++ * on the same gfn succeeded above. If that turns out to be false, crashing ++ * the domain should be the safest way of making sure we don't leak memory. 
++ */ ++ if ( reset && p2m_set_entry(p2m, gfn, mfn0, PAGE_ORDER_2M, ++ type0, p2m->default_access) ) ++ { ++ ASSERT_UNREACHABLE(); ++ domain_crash(d); ++ } ++ + out: + gfn_unlock(p2m, gfn, SUPERPAGE_ORDER); + return ret; +@@ -871,19 +882,30 @@ p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count) + } + + /* Try to remove the page, restoring old mapping if it fails. */ +- p2m_set_entry(p2m, gfns[i], _mfn(INVALID_MFN), PAGE_ORDER_4K, +- p2m_populate_on_demand, p2m->default_access); ++ if ( p2m_set_entry(p2m, gfns[i], _mfn(INVALID_MFN), PAGE_ORDER_4K, ++ p2m_populate_on_demand, p2m->default_access) ) ++ goto skip; + + /* See if the page was successfully unmapped. (Allow one refcount + * for being allocated to a domain.) */ + if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 ) + { ++ /* ++ * If the previous p2m_set_entry call succeeded, this one shouldn't ++ * be able to fail. If it does, crashing the domain should be safe. ++ */ ++ if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K, ++ types[i], p2m->default_access) ) ++ { ++ ASSERT_UNREACHABLE(); ++ domain_crash(d); ++ goto out_unmap; ++ } ++ ++ skip: + unmap_domain_page(map[i]); + map[i] = NULL; + +- p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K, +- types[i], p2m->default_access); +- + continue; + } + } +@@ -902,12 +924,25 @@ p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count) + + unmap_domain_page(map[i]); + +- /* See comment in p2m_pod_zero_check_superpage() re gnttab +- * check timing. */ +- if ( j < PAGE_SIZE/sizeof(*map[i]) ) ++ map[i] = NULL; ++ ++ /* ++ * See comment in p2m_pod_zero_check_superpage() re gnttab ++ * check timing. ++ */ ++ if ( j < (PAGE_SIZE / sizeof(*map[i])) ) + { +- p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K, +- types[i], p2m->default_access); ++ /* ++ * If the previous p2m_set_entry call succeeded, this one shouldn't ++ * be able to fail. If it does, crashing the domain should be safe. 
++ */ ++ if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K, ++ types[i], p2m->default_access) ) ++ { ++ ASSERT_UNREACHABLE(); ++ domain_crash(d); ++ goto out_unmap; ++ } + } + else + { +@@ -931,7 +966,17 @@ p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count) + p2m->pod.entry_count++; + } + } +- ++ ++ return; ++ ++out_unmap: ++ /* ++ * Something went wrong, probably crashing the domain. Unmap ++ * everything and return. ++ */ ++ for ( i = 0; i < count; i++ ) ++ if ( map[i] ) ++ unmap_domain_page(map[i]); + } + + #define POD_SWEEP_LIMIT 1024 +-- +2.15.0 + diff --git a/emulators/xen-kernel/files/0001-x86-Meltdown-band-aid-against-malicious-64-bit-PV-gu.patch b/emulators/xen-kernel/files/0001-x86-Meltdown-band-aid-against-malicious-64-bit-PV-gu.patch new file mode 100644 index 000000000000..97c93b30e1e2 --- /dev/null +++ b/emulators/xen-kernel/files/0001-x86-Meltdown-band-aid-against-malicious-64-bit-PV-gu.patch @@ -0,0 +1,756 @@ +From e19517a3355acaaa2ff83018bc41e7fd044161e5 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Wed, 17 Jan 2018 17:24:12 +0100 +Subject: [PATCH 1/2] x86: Meltdown band-aid against malicious 64-bit PV guests + +This is a very simplistic change limiting the amount of memory a running +64-bit PV guest has mapped (and hence available for attacking): Only the +mappings of stack, IDT, and TSS are being cloned from the direct map +into per-CPU page tables. Guest controlled parts of the page tables are +being copied into those per-CPU page tables upon entry into the guest. +Cross-vCPU synchronization of top level page table entry changes is +being effected by forcing other active vCPU-s of the guest into the +hypervisor. + +The change to context_switch() isn't strictly necessary, but there's no +reason to keep switching page tables once a PV guest is being scheduled +out. 
+ +This isn't providing full isolation yet, but it should be covering all +pieces of information exposure of which would otherwise require an XSA. + +There is certainly much room for improvement, especially of performance, +here - first and foremost suppressing all the negative effects on AMD +systems. But in the interest of backportability (including to really old +hypervisors, which may not even have alternative patching) any such is +being left out here. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 5784de3e2067ed73efc2fe42e62831e8ae7f46c4 +master date: 2018-01-16 17:49:03 +0100 +--- + xen/arch/x86/domain.c | 5 + + xen/arch/x86/mm.c | 17 ++++ + xen/arch/x86/smpboot.c | 198 +++++++++++++++++++++++++++++++++++++ + xen/arch/x86/x86_64/asm-offsets.c | 2 + + xen/arch/x86/x86_64/compat/entry.S | 11 +++ + xen/arch/x86/x86_64/entry.S | 149 +++++++++++++++++++++++++++- + xen/include/asm-x86/asm_defns.h | 30 ++++++ + xen/include/asm-x86/current.h | 12 +++ + xen/include/asm-x86/processor.h | 1 + + xen/include/asm-x86/x86_64/page.h | 5 +- + 10 files changed, 424 insertions(+), 6 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 6539b75fa7..3cf18f95b7 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1949,6 +1949,9 @@ static void paravirt_ctxt_switch_to(struct vcpu *v) + + switch_kernel_stack(v); + ++ this_cpu(root_pgt)[root_table_offset(PERDOMAIN_VIRT_START)] = ++ l4e_from_page(v->domain->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW); ++ + cr4 = pv_guest_cr4_to_real_cr4(v); + if ( unlikely(cr4 != read_cr4()) ) + write_cr4(cr4); +@@ -2096,6 +2099,8 @@ void context_switch(struct vcpu *prev, struct vcpu *next) + + ASSERT(local_irq_is_enabled()); + ++ get_cpu_info()->xen_cr3 = 0; ++ + cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask); + /* Allow at most one CPU at a time to be dirty. 
*/ + ASSERT(cpumask_weight(&dirty_mask) <= 1); +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 50f500c940..c9e4003989 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -3857,6 +3857,7 @@ long do_mmu_update( + struct vcpu *curr = current, *v = curr; + struct domain *d = v->domain, *pt_owner = d, *pg_owner; + struct domain_mmap_cache mapcache; ++ bool_t sync_guest = 0; + uint32_t xsm_needed = 0; + uint32_t xsm_checked = 0; + int rc = put_old_guest_table(curr); +@@ -4005,6 +4006,8 @@ long do_mmu_update( + case PGT_l4_page_table: + rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, v); ++ if ( !rc ) ++ sync_guest = 1; + break; + case PGT_writable_page: + perfc_incr(writable_mmu_updates); +@@ -4107,6 +4110,20 @@ long do_mmu_update( + + domain_mmap_cache_destroy(&mapcache); + ++ if ( sync_guest ) ++ { ++ /* ++ * Force other vCPU-s of the affected guest to pick up L4 entry ++ * changes (if any). Issue a flush IPI with empty operation mask to ++ * facilitate this (including ourselves waiting for the IPI to ++ * actually have arrived). Utilize the fact that FLUSH_VA_VALID is ++ * meaningless without FLUSH_CACHE, but will allow to pass the no-op ++ * check in flush_area_mask(). ++ */ ++ flush_area_mask(pt_owner->domain_dirty_cpumask, ++ ZERO_BLOCK_PTR, FLUSH_VA_VALID); ++ } ++ + perfc_add(num_page_updates, i); + + out: +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index f9e4ee85ff..eaeec5acf0 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -319,6 +319,9 @@ void start_secondary(void *unused) + */ + spin_debug_disable(); + ++ get_cpu_info()->xen_cr3 = 0; ++ get_cpu_info()->pv_cr3 = __pa(this_cpu(root_pgt)); ++ + load_system_tables(); + + /* Full exception support from here on in. 
*/ +@@ -628,6 +631,187 @@ void cpu_exit_clear(unsigned int cpu) + set_cpu_state(CPU_STATE_DEAD); + } + ++static int clone_mapping(const void *ptr, root_pgentry_t *rpt) ++{ ++ unsigned long linear = (unsigned long)ptr, pfn; ++ unsigned int flags; ++ l3_pgentry_t *pl3e = l4e_to_l3e(idle_pg_table[root_table_offset(linear)]) + ++ l3_table_offset(linear); ++ l2_pgentry_t *pl2e; ++ l1_pgentry_t *pl1e; ++ ++ if ( linear < DIRECTMAP_VIRT_START ) ++ return 0; ++ ++ flags = l3e_get_flags(*pl3e); ++ ASSERT(flags & _PAGE_PRESENT); ++ if ( flags & _PAGE_PSE ) ++ { ++ pfn = (l3e_get_pfn(*pl3e) & ~((1UL << (2 * PAGETABLE_ORDER)) - 1)) | ++ (PFN_DOWN(linear) & ((1UL << (2 * PAGETABLE_ORDER)) - 1)); ++ flags &= ~_PAGE_PSE; ++ } ++ else ++ { ++ pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(linear); ++ flags = l2e_get_flags(*pl2e); ++ ASSERT(flags & _PAGE_PRESENT); ++ if ( flags & _PAGE_PSE ) ++ { ++ pfn = (l2e_get_pfn(*pl2e) & ~((1UL << PAGETABLE_ORDER) - 1)) | ++ (PFN_DOWN(linear) & ((1UL << PAGETABLE_ORDER) - 1)); ++ flags &= ~_PAGE_PSE; ++ } ++ else ++ { ++ pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(linear); ++ flags = l1e_get_flags(*pl1e); ++ if ( !(flags & _PAGE_PRESENT) ) ++ return 0; ++ pfn = l1e_get_pfn(*pl1e); ++ } ++ } ++ ++ if ( !(root_get_flags(rpt[root_table_offset(linear)]) & _PAGE_PRESENT) ) ++ { ++ pl3e = alloc_xen_pagetable(); ++ if ( !pl3e ) ++ return -ENOMEM; ++ clear_page(pl3e); ++ l4e_write(&rpt[root_table_offset(linear)], ++ l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR)); ++ } ++ else ++ pl3e = l4e_to_l3e(rpt[root_table_offset(linear)]); ++ ++ pl3e += l3_table_offset(linear); ++ ++ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) ++ { ++ pl2e = alloc_xen_pagetable(); ++ if ( !pl2e ) ++ return -ENOMEM; ++ clear_page(pl2e); ++ l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR)); ++ } ++ else ++ { ++ ASSERT(!(l3e_get_flags(*pl3e) & _PAGE_PSE)); ++ pl2e = l3e_to_l2e(*pl3e); ++ } ++ ++ pl2e += l2_table_offset(linear); ++ ++ if ( !(l2e_get_flags(*pl2e) & 
_PAGE_PRESENT) ) ++ { ++ pl1e = alloc_xen_pagetable(); ++ if ( !pl1e ) ++ return -ENOMEM; ++ clear_page(pl1e); ++ l2e_write(pl2e, l2e_from_paddr(__pa(pl1e), __PAGE_HYPERVISOR)); ++ } ++ else ++ { ++ ASSERT(!(l2e_get_flags(*pl2e) & _PAGE_PSE)); ++ pl1e = l2e_to_l1e(*pl2e); ++ } ++ ++ pl1e += l1_table_offset(linear); ++ ++ if ( l1e_get_flags(*pl1e) & _PAGE_PRESENT ) ++ { ++ ASSERT(l1e_get_pfn(*pl1e) == pfn); ++ ASSERT(l1e_get_flags(*pl1e) == flags); ++ } ++ else ++ l1e_write(pl1e, l1e_from_pfn(pfn, flags)); ++ ++ return 0; ++} ++ ++DEFINE_PER_CPU(root_pgentry_t *, root_pgt); ++ ++static int setup_cpu_root_pgt(unsigned int cpu) ++{ ++ root_pgentry_t *rpt = alloc_xen_pagetable(); ++ unsigned int off; ++ int rc; ++ ++ if ( !rpt ) ++ return -ENOMEM; ++ ++ clear_page(rpt); ++ per_cpu(root_pgt, cpu) = rpt; ++ ++ rpt[root_table_offset(RO_MPT_VIRT_START)] = ++ idle_pg_table[root_table_offset(RO_MPT_VIRT_START)]; ++ /* SH_LINEAR_PT inserted together with guest mappings. */ ++ /* PERDOMAIN inserted during context switch. */ ++ rpt[root_table_offset(XEN_VIRT_START)] = ++ idle_pg_table[root_table_offset(XEN_VIRT_START)]; ++ ++ /* Install direct map page table entries for stack, IDT, and TSS. 
*/ ++ for ( off = rc = 0; !rc && off < STACK_SIZE; off += PAGE_SIZE ) ++ rc = clone_mapping(__va(__pa(stack_base[cpu])) + off, rpt); ++ ++ if ( !rc ) ++ rc = clone_mapping(idt_tables[cpu], rpt); ++ if ( !rc ) ++ rc = clone_mapping(&per_cpu(init_tss, cpu), rpt); ++ ++ return rc; ++} ++ ++static void cleanup_cpu_root_pgt(unsigned int cpu) ++{ ++ root_pgentry_t *rpt = per_cpu(root_pgt, cpu); ++ unsigned int r; ++ ++ if ( !rpt ) ++ return; ++ ++ per_cpu(root_pgt, cpu) = NULL; ++ ++ for ( r = root_table_offset(DIRECTMAP_VIRT_START); ++ r < root_table_offset(HYPERVISOR_VIRT_END); ++r ) ++ { ++ l3_pgentry_t *l3t; ++ unsigned int i3; ++ ++ if ( !(root_get_flags(rpt[r]) & _PAGE_PRESENT) ) ++ continue; ++ ++ l3t = l4e_to_l3e(rpt[r]); ++ ++ for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; ++i3 ) ++ { ++ l2_pgentry_t *l2t; ++ unsigned int i2; ++ ++ if ( !(l3e_get_flags(l3t[i3]) & _PAGE_PRESENT) ) ++ continue; ++ ++ ASSERT(!(l3e_get_flags(l3t[i3]) & _PAGE_PSE)); ++ l2t = l3e_to_l2e(l3t[i3]); ++ ++ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; ++i2 ) ++ { ++ if ( !(l2e_get_flags(l2t[i2]) & _PAGE_PRESENT) ) ++ continue; ++ ++ ASSERT(!(l2e_get_flags(l2t[i2]) & _PAGE_PSE)); ++ free_xen_pagetable(l2e_to_l1e(l2t[i2])); ++ } ++ ++ free_xen_pagetable(l2t); ++ } ++ ++ free_xen_pagetable(l3t); ++ } ++ ++ free_xen_pagetable(rpt); ++} ++ + static void cpu_smpboot_free(unsigned int cpu) + { + unsigned int order, socket = cpu_to_socket(cpu); +@@ -664,6 +848,8 @@ static void cpu_smpboot_free(unsigned int cpu) + free_domheap_page(mfn_to_page(mfn)); + } + ++ cleanup_cpu_root_pgt(cpu); ++ + order = get_order_from_pages(NR_RESERVED_GDT_PAGES); + free_xenheap_pages(per_cpu(gdt_table, cpu), order); + +@@ -719,6 +905,9 @@ static int cpu_smpboot_alloc(unsigned int cpu) + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); + ++ if ( setup_cpu_root_pgt(cpu) ) ++ goto oom; ++ + for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1); + i < nr_cpu_ids && i <= (cpu | 
(STUBS_PER_PAGE - 1)); ++i ) + if ( cpu_online(i) && cpu_to_node(i) == node ) +@@ -773,6 +962,8 @@ static struct notifier_block cpu_smpboot_nfb = { + + void __init smp_prepare_cpus(unsigned int max_cpus) + { ++ int rc; ++ + register_cpu_notifier(&cpu_smpboot_nfb); + + mtrr_aps_sync_begin(); +@@ -786,6 +977,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) + + stack_base[0] = stack_start; + ++ rc = setup_cpu_root_pgt(0); ++ if ( rc ) ++ panic("Error %d setting up PV root page table\n", rc); ++ get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0)); ++ + set_nr_sockets(); + + socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets); +@@ -850,6 +1046,8 @@ void __init smp_prepare_boot_cpu(void) + { + cpumask_set_cpu(smp_processor_id(), &cpu_online_map); + cpumask_set_cpu(smp_processor_id(), &cpu_present_map); ++ ++ get_cpu_info()->xen_cr3 = 0; + } + + static void +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index a3ae7a475f..4f2ba28520 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -137,6 +137,8 @@ void __dummy__(void) + OFFSET(CPUINFO_processor_id, struct cpu_info, processor_id); + OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); + OFFSET(CPUINFO_cr4, struct cpu_info, cr4); ++ OFFSET(CPUINFO_xen_cr3, struct cpu_info, xen_cr3); ++ OFFSET(CPUINFO_pv_cr3, struct cpu_info, pv_cr3); + DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); + BLANK(); + +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index 7ee01597a3..f7e53fb3cb 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -270,6 +270,17 @@ ENTRY(cstar_enter) + pushq $0 + movl $TRAP_syscall, 4(%rsp) + SAVE_ALL ++ ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx ++ neg %rcx ++ jz .Lcstar_cr3_okay ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++ neg %rcx ++ write_cr3 rcx, rdi, rsi ++ movq $0, 
STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++.Lcstar_cr3_okay: ++ + GET_CURRENT(bx) + movq VCPU_domain(%rbx),%rcx + cmpb $0,DOMAIN_is_32bit_pv(%rcx) +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index cebb1e4f4f..d63e734bb3 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -36,6 +36,32 @@ ENTRY(switch_to_kernel) + /* %rbx: struct vcpu, interrupts disabled */ + restore_all_guest: + ASSERT_INTERRUPTS_DISABLED ++ ++ /* Copy guest mappings and switch to per-CPU root page table. */ ++ mov %cr3, %r9 ++ GET_STACK_END(dx) ++ mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi ++ movabs $PADDR_MASK & PAGE_MASK, %rsi ++ movabs $DIRECTMAP_VIRT_START, %rcx ++ mov %rdi, %rax ++ and %rsi, %rdi ++ and %r9, %rsi ++ add %rcx, %rdi ++ add %rcx, %rsi ++ mov $ROOT_PAGETABLE_FIRST_XEN_SLOT, %ecx ++ mov root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rsi), %r8 ++ mov %r8, root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rdi) ++ rep movsq ++ mov $ROOT_PAGETABLE_ENTRIES - \ ++ ROOT_PAGETABLE_LAST_XEN_SLOT - 1, %ecx ++ sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \ ++ ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rsi ++ sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \ ++ ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi ++ rep movsq ++ mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx) ++ write_cr3 rax, rdi, rsi ++ + RESTORE_ALL + testw $TRAP_syscall,4(%rsp) + jz iret_exit_to_guest +@@ -70,6 +96,22 @@ iret_exit_to_guest: + ALIGN + /* No special register assumptions. */ + restore_all_xen: ++ /* ++ * Check whether we need to switch to the per-CPU page tables, in ++ * case we return to late PV exit code (from an NMI or #MC). ++ */ ++ GET_STACK_END(ax) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rax), %rdx ++ mov STACK_CPUINFO_FIELD(pv_cr3)(%rax), %rax ++ test %rdx, %rdx ++ /* ++ * Ideally the condition would be "nsz", but such doesn't exist, ++ * so "g" will have to do. 
++ */ ++UNLIKELY_START(g, exit_cr3) ++ write_cr3 rax, rdi, rsi ++UNLIKELY_END(exit_cr3) ++ + RESTORE_ALL adj=8 + iretq + +@@ -99,7 +141,18 @@ ENTRY(lstar_enter) + pushq $0 + movl $TRAP_syscall, 4(%rsp) + SAVE_ALL +- GET_CURRENT(bx) ++ ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx ++ neg %rcx ++ jz .Llstar_cr3_okay ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++ neg %rcx ++ write_cr3 rcx, r11, r12 ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++.Llstar_cr3_okay: ++ ++ __GET_CURRENT(bx) + testb $TF_kernel_mode,VCPU_thread_flags(%rbx) + jz switch_to_kernel + +@@ -248,7 +301,18 @@ GLOBAL(sysenter_eflags_saved) + pushq $0 + movl $TRAP_syscall, 4(%rsp) + SAVE_ALL +- GET_CURRENT(bx) ++ ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx ++ neg %rcx ++ jz .Lsyse_cr3_okay ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++ neg %rcx ++ write_cr3 rcx, rdi, rsi ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++.Lsyse_cr3_okay: ++ ++ __GET_CURRENT(bx) + cmpb $0,VCPU_sysenter_disables_events(%rbx) + movq VCPU_sysenter_addr(%rbx),%rax + setne %cl +@@ -284,13 +348,23 @@ ENTRY(int80_direct_trap) + movl $0x80, 4(%rsp) + SAVE_ALL + ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx ++ neg %rcx ++ jz .Lint80_cr3_okay ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++ neg %rcx ++ write_cr3 rcx, rdi, rsi ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx) ++.Lint80_cr3_okay: ++ + cmpb $0,untrusted_msi(%rip) + UNLIKELY_START(ne, msi_check) + movl $0x80,%edi + call check_for_unexpected_msi + UNLIKELY_END(msi_check) + +- GET_CURRENT(bx) ++ __GET_CURRENT(bx) + + /* Check that the callback is non-null. 
*/ + leaq VCPU_int80_bounce(%rbx),%rdx +@@ -441,9 +515,27 @@ ENTRY(dom_crash_sync_extable) + + ENTRY(common_interrupt) + SAVE_ALL CLAC ++ ++ GET_STACK_END(14) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx ++ mov %rcx, %r15 ++ neg %rcx ++ jz .Lintr_cr3_okay ++ jns .Lintr_cr3_load ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ neg %rcx ++.Lintr_cr3_load: ++ write_cr3 rcx, rdi, rsi ++ xor %ecx, %ecx ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ testb $3, UREGS_cs(%rsp) ++ cmovnz %rcx, %r15 ++.Lintr_cr3_okay: ++ + CR4_PV32_RESTORE + movq %rsp,%rdi + callq do_IRQ ++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + jmp ret_from_intr + + /* No special register assumptions. */ +@@ -461,6 +553,23 @@ ENTRY(page_fault) + /* No special register assumptions. */ + GLOBAL(handle_exception) + SAVE_ALL CLAC ++ ++ GET_STACK_END(14) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx ++ mov %rcx, %r15 ++ neg %rcx ++ jz .Lxcpt_cr3_okay ++ jns .Lxcpt_cr3_load ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ neg %rcx ++.Lxcpt_cr3_load: ++ write_cr3 rcx, rdi, rsi ++ xor %ecx, %ecx ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ testb $3, UREGS_cs(%rsp) ++ cmovnz %rcx, %r15 ++.Lxcpt_cr3_okay: ++ + handle_exception_saved: + GET_CURRENT(bx) + testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) +@@ -525,6 +634,7 @@ handle_exception_saved: + leaq exception_table(%rip),%rdx + PERFC_INCR(exceptions, %rax, %rbx) + callq *(%rdx,%rax,8) ++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + testb $3,UREGS_cs(%rsp) + jz restore_all_xen + leaq VCPU_trap_bounce(%rbx),%rdx +@@ -557,6 +667,7 @@ exception_with_ints_disabled: + rep; movsq # make room for ec/ev + 1: movq UREGS_error_code(%rsp),%rax # ec/ev + movq %rax,UREGS_kernel_sizeof(%rsp) ++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + jmp restore_all_xen # return to fixup code + + /* No special register assumptions. 
*/ +@@ -634,6 +745,17 @@ ENTRY(double_fault) + movl $TRAP_double_fault,4(%rsp) + /* Set AC to reduce chance of further SMAP faults */ + SAVE_ALL STAC ++ ++ GET_STACK_END(bx) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rbx ++ test %rbx, %rbx ++ jz .Ldblf_cr3_okay ++ jns .Ldblf_cr3_load ++ neg %rbx ++.Ldblf_cr3_load: ++ write_cr3 rbx, rdi, rsi ++.Ldblf_cr3_okay: ++ + movq %rsp,%rdi + call do_double_fault + BUG /* do_double_fault() shouldn't return. */ +@@ -652,10 +774,28 @@ ENTRY(nmi) + movl $TRAP_nmi,4(%rsp) + handle_ist_exception: + SAVE_ALL CLAC ++ ++ GET_STACK_END(14) ++ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx ++ mov %rcx, %r15 ++ neg %rcx ++ jz .List_cr3_okay ++ jns .List_cr3_load ++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++ neg %rcx ++.List_cr3_load: ++ write_cr3 rcx, rdi, rsi ++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14) ++.List_cr3_okay: ++ + CR4_PV32_RESTORE + testb $3,UREGS_cs(%rsp) + jz 1f +- /* Interrupted guest context. Copy the context to stack bottom. */ ++ /* ++ * Interrupted guest context. Clear the restore value for xen_cr3 ++ * and copy the context to stack bottom. 
++ */ ++ xor %r15, %r15 + GET_CPUINFO_FIELD(guest_cpu_user_regs,di) + movq %rsp,%rsi + movl $UREGS_kernel_sizeof/8,%ecx +@@ -665,6 +805,7 @@ handle_ist_exception: + movzbl UREGS_entry_vector(%rsp),%eax + leaq exception_table(%rip),%rdx + callq *(%rdx,%rax,8) ++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + cmpb $TRAP_nmi,UREGS_entry_vector(%rsp) + jne ret_from_intr + +diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h +index 6e5c079ad8..6cfdaa1aa0 100644 +--- a/xen/include/asm-x86/asm_defns.h ++++ b/xen/include/asm-x86/asm_defns.h +@@ -93,9 +93,30 @@ void ret_from_intr(void); + UNLIKELY_DONE(mp, tag); \ + __UNLIKELY_END(tag) + ++ .equ .Lrax, 0 ++ .equ .Lrcx, 1 ++ .equ .Lrdx, 2 ++ .equ .Lrbx, 3 ++ .equ .Lrsp, 4 ++ .equ .Lrbp, 5 ++ .equ .Lrsi, 6 ++ .equ .Lrdi, 7 ++ .equ .Lr8, 8 ++ .equ .Lr9, 9 ++ .equ .Lr10, 10 ++ .equ .Lr11, 11 ++ .equ .Lr12, 12 ++ .equ .Lr13, 13 ++ .equ .Lr14, 14 ++ .equ .Lr15, 15 ++ + #define STACK_CPUINFO_FIELD(field) (1 - CPUINFO_sizeof + CPUINFO_##field) + #define GET_STACK_END(reg) \ ++ .if .Lr##reg > 8; \ ++ movq $STACK_SIZE-1, %r##reg; \ ++ .else; \ + movl $STACK_SIZE-1, %e##reg; \ ++ .endif; \ + orq %rsp, %r##reg + + #define GET_CPUINFO_FIELD(field, reg) \ +@@ -177,6 +198,15 @@ void ret_from_intr(void); + #define ASM_STAC ASM_AC(STAC) + #define ASM_CLAC ASM_AC(CLAC) + ++.macro write_cr3 val:req, tmp1:req, tmp2:req ++ mov %cr4, %\tmp1 ++ mov %\tmp1, %\tmp2 ++ and $~X86_CR4_PGE, %\tmp1 ++ mov %\tmp1, %cr4 ++ mov %\val, %cr3 ++ mov %\tmp2, %cr4 ++.endm ++ + #define CR4_PV32_RESTORE \ + 667: ASM_NOP5; \ + .pushsection .altinstr_replacement, "ax"; \ +diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h +index e6587e684c..397fa4c38f 100644 +--- a/xen/include/asm-x86/current.h ++++ b/xen/include/asm-x86/current.h +@@ -42,6 +42,18 @@ struct cpu_info { + struct vcpu *current_vcpu; + unsigned long per_cpu_offset; + unsigned long cr4; ++ /* ++ * Of the two following fields the latter is being set to 
the CR3 value ++ * to be used on the given pCPU for loading whenever 64-bit PV guest ++ * context is being entered. The value never changes once set. ++ * The former is the value to restore when re-entering Xen, if any. IOW ++ * its value being zero means there's nothing to restore. However, its ++ * value can also be negative, indicating to the exit-to-Xen code that ++ * restoring is not necessary, but allowing any nested entry code paths ++ * to still know the value to put back into CR3. ++ */ ++ unsigned long xen_cr3; ++ unsigned long pv_cr3; + /* get_stack_bottom() must be 16-byte aligned */ + }; + +diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h +index ccd406a3fe..9906f38f2d 100644 +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -517,6 +517,7 @@ extern idt_entry_t idt_table[]; + extern idt_entry_t *idt_tables[]; + + DECLARE_PER_CPU(struct tss_struct, init_tss); ++DECLARE_PER_CPU(root_pgentry_t *, root_pgt); + + extern void init_int80_direct_trap(struct vcpu *v); + +diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h +index 589f22552e..afc77c3237 100644 +--- a/xen/include/asm-x86/x86_64/page.h ++++ b/xen/include/asm-x86/x86_64/page.h +@@ -25,8 +25,8 @@ + /* These are architectural limits. Current CPUs support only 40-bit phys. 
*/ + #define PADDR_BITS 52 + #define VADDR_BITS 48 +-#define PADDR_MASK ((1UL << PADDR_BITS)-1) +-#define VADDR_MASK ((1UL << VADDR_BITS)-1) ++#define PADDR_MASK ((_AC(1,UL) << PADDR_BITS) - 1) ++#define VADDR_MASK ((_AC(1,UL) << VADDR_BITS) - 1) + + #define is_canonical_address(x) (((long)(x) >> 47) == ((long)(x) >> 63)) + +@@ -117,6 +117,7 @@ typedef l4_pgentry_t root_pgentry_t; + : (((_s) < ROOT_PAGETABLE_FIRST_XEN_SLOT) || \ + ((_s) > ROOT_PAGETABLE_LAST_XEN_SLOT))) + ++#define root_table_offset l4_table_offset + #define root_get_pfn l4e_get_pfn + #define root_get_flags l4e_get_flags + #define root_get_intpte l4e_get_intpte +-- +2.15.1 + diff --git a/emulators/xen-kernel/files/0001-x86-compat-fix-compilation-errors-with-clang-6.patch b/emulators/xen-kernel/files/0001-x86-compat-fix-compilation-errors-with-clang-6.patch new file mode 100644 index 000000000000..a75cf8b29281 --- /dev/null +++ b/emulators/xen-kernel/files/0001-x86-compat-fix-compilation-errors-with-clang-6.patch @@ -0,0 +1,76 @@ +From 58e028648e3bc831b1b60a39b7f1661538fa6a34 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne <roger.pau@citrix.com> +Date: Tue, 23 Jan 2018 16:05:17 +0000 +Subject: [PATCH] x86/compat: fix compilation errors with clang 6 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The following errors are generated when compiling Xen with clang 6: + +In file included from x86_64/asm-offsets.c:9: +In file included from /root/src/xen/xen/include/xen/sched.h:8: +In file included from /root/src/xen/xen/include/xen/shared.h:6: +In file included from /root/src/xen/xen/include/compat/arch-x86/../xen.h:9: +/root/src/xen/xen/include/compat/arch-x86/xen.h:10:10: error: the current #pragma pack aligment + value is modified in the included file [-Werror,-Wpragma-pack] +#include "xen-x86_32.h" + ^ +/root/src/xen/xen/include/compat/arch-x86/xen-x86_32.h:40:9: note: previous '#pragma pack' + directive that modifies alignment is here +#pragma pack() + ^ 
+In file included from x86_64/asm-offsets.c:9: +In file included from /root/src/xen/xen/include/xen/sched.h:8: +In file included from /root/src/xen/xen/include/xen/shared.h:6: +/root/src/xen/xen/include/compat/arch-x86/../xen.h:9:10: error: the current #pragma pack aligment + value is modified in the included file [-Werror,-Wpragma-pack] +#include "arch-x86/xen.h" + ^ +/root/src/xen/xen/include/compat/arch-x86/xen.h:71:9: note: previous '#pragma pack' directive that + modifies alignment is here +#pragma pack() + ^ +2 errors generated. + +Fix this by using pragma push/pop in order to store the current pragma +value in the compiler stack and later restoring it when using clang. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +--- +Cc: Andrew Cooper <andrew.cooper3@citrix.com> +Cc: George Dunlap <George.Dunlap@eu.citrix.com> +Cc: Ian Jackson <ian.jackson@eu.citrix.com> +Cc: Jan Beulich <jbeulich@suse.com> +Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Cc: Stefano Stabellini <sstabellini@kernel.org> +Cc: Tim Deegan <tim@xen.org> +Cc: Wei Liu <wei.liu2@citrix.com> +--- +Changes since v1: + - Only use push/pop with clang. +--- + xen/include/Makefile | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/xen/include/Makefile b/xen/include/Makefile +index 268bc9d6ba..eeae942903 100644 +--- a/xen/include/Makefile ++++ b/xen/include/Makefile +@@ -34,8 +34,13 @@ cppflags-y := -include public/xen-compat.h + cppflags-$(CONFIG_X86) += -m32 + + # 8-byte types are 4-byte aligned on x86_32 ... 
++ifeq ($(clang),y) ++prefix-$(CONFIG_X86) := \#pragma pack(push, 4) ++suffix-$(CONFIG_X86) := \#pragma pack(pop) ++else + prefix-$(CONFIG_X86) := \#pragma pack(4) + suffix-$(CONFIG_X86) := \#pragma pack() ++endif + + endif + +-- +2.15.1 + diff --git a/emulators/xen-kernel/files/0002-p2m-Check-return-value-of-p2m_set_entry-when-decreas.patch b/emulators/xen-kernel/files/0002-p2m-Check-return-value-of-p2m_set_entry-when-decreas.patch new file mode 100644 index 000000000000..e72d7511b3be --- /dev/null +++ b/emulators/xen-kernel/files/0002-p2m-Check-return-value-of-p2m_set_entry-when-decreas.patch @@ -0,0 +1,109 @@ +From 01feeda5363dd8d2fea8395c2c435203751c8ba5 Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Fri, 10 Nov 2017 16:53:55 +0000 +Subject: [PATCH 2/2] p2m: Check return value of p2m_set_entry() when + decreasing reservation + +If the entire range specified to p2m_pod_decrease_reservation() is marked +populate-on-demand, then it will make a single p2m_set_entry() call, +reducing its PoD entry count. + +Unfortunately, in the right circumstances, this p2m_set_entry() call +may fail. In that case, repeated calls to decrease_reservation() may +cause p2m->pod.entry_count to fall below zero, potentially tripping +over BUG_ON()s to the contrary. + +Instead, check to see if the entry succeeded, and return false if not. +The caller will then call guest_remove_page() on the gfns, which will +return -EINVAL upon finding no valid memory there to return. + +Unfortunately if the order > 0, the entry may have partially changed. +A domain_crash() is probably the safest thing in that case. + +Other p2m_set_entry() calls in the same function should be fine, +because they are writing the entry at its current order. Nonetheless, +check the return value and crash if our assumption turns out to be +wrong. + +This is part of XSA-247. 
+ +Reported-by: George Dunlap <george.dunlap.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- +v2: Crash the domain if we're not sure it's safe (or if we think it +can't happen) +--- + xen/arch/x86/mm/p2m-pod.c | 42 +++++++++++++++++++++++++++++++++--------- + 1 file changed, 33 insertions(+), 9 deletions(-) + +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index 5ec8a37949..91d309647e 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -557,11 +557,23 @@ p2m_pod_decrease_reservation(struct domain *d, + + if ( !nonpod ) + { +- /* All PoD: Mark the whole region invalid and tell caller +- * we're done. */ +- p2m_set_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, +- p2m->default_access); +- p2m->pod.entry_count-=(1<<order); ++ /* ++ * All PoD: Mark the whole region invalid and tell caller ++ * we're done. ++ */ ++ if ( p2m_set_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, ++ p2m->default_access) ) ++ { ++ /* ++ * If this fails, we can't tell how much of the range was changed. ++ * Best to crash the domain unless we're sure a partial change is ++ * impossible. 
++ */ ++ if ( order != 0 ) ++ domain_crash(d); ++ goto out_unlock; ++ } ++ p2m->pod.entry_count -= 1UL << order; + BUG_ON(p2m->pod.entry_count < 0); + ret = 1; + goto out_entry_check; +@@ -602,8 +614,14 @@ p2m_pod_decrease_reservation(struct domain *d, + n = 1UL << cur_order; + if ( t == p2m_populate_on_demand ) + { +- p2m_set_entry(p2m, gpfn + i, _mfn(INVALID_MFN), cur_order, +- p2m_invalid, p2m->default_access); ++ /* This shouldn't be able to fail */ ++ if ( p2m_set_entry(p2m, gpfn + i, _mfn(INVALID_MFN), cur_order, ++ p2m_invalid, p2m->default_access) ) ++ { ++ ASSERT_UNREACHABLE(); ++ domain_crash(d); ++ goto out_unlock; ++ } + p2m->pod.entry_count -= n; + BUG_ON(p2m->pod.entry_count < 0); + pod -= n; +@@ -624,8 +642,14 @@ p2m_pod_decrease_reservation(struct domain *d, + + page = mfn_to_page(mfn); + +- p2m_set_entry(p2m, gpfn + i, _mfn(INVALID_MFN), cur_order, +- p2m_invalid, p2m->default_access); ++ /* This shouldn't be able to fail */ ++ if ( p2m_set_entry(p2m, gpfn + i, _mfn(INVALID_MFN), cur_order, ++ p2m_invalid, p2m->default_access) ) ++ { ++ ASSERT_UNREACHABLE(); ++ domain_crash(d); ++ goto out_unlock; ++ } + p2m_tlb_flush_sync(p2m); + for ( j = 0; j < n; ++j ) + set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY); +-- +2.15.0 + diff --git a/emulators/xen-kernel/files/0002-x86-allow-Meltdown-band-aid-to-be-disabled.patch b/emulators/xen-kernel/files/0002-x86-allow-Meltdown-band-aid-to-be-disabled.patch new file mode 100644 index 000000000000..20894e12cc19 --- /dev/null +++ b/emulators/xen-kernel/files/0002-x86-allow-Meltdown-band-aid-to-be-disabled.patch @@ -0,0 +1,163 @@ +From e19d0af4ee2ae9e42a85db639fd6848e72f5658b Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Wed, 17 Jan 2018 17:24:59 +0100 +Subject: [PATCH 2/2] x86: allow Meltdown band-aid to be disabled + +First of all we don't need it on AMD systems. Additionally allow its use +to be controlled by command line option. 
For best backportability, this +intentionally doesn't use alternative instruction patching to achieve +the intended effect - while we likely want it, this will be later +follow-up. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: e871e80c38547d9faefc6604532ba3e985e65873 +master date: 2018-01-16 17:50:59 +0100 +--- + docs/misc/xen-command-line.markdown | 12 ++++++++++++ + xen/arch/x86/domain.c | 7 +++++-- + xen/arch/x86/mm.c | 2 +- + xen/arch/x86/smpboot.c | 17 ++++++++++++++--- + xen/arch/x86/x86_64/entry.S | 2 ++ + 5 files changed, 34 insertions(+), 6 deletions(-) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 2dacb5d073..aecf9fd49d 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1621,6 +1621,18 @@ In the case that x2apic is in use, this option switches between physical and + clustered mode. The default, given no hint from the **FADT**, is cluster + mode. + ++### xpti ++> `= <boolean>` ++ ++> Default: `false` on AMD hardware ++> Default: `true` everywhere else ++ ++Override default selection of whether to isolate 64-bit PV guest page ++tables. ++ ++** WARNING: Not yet a complete isolation implementation, but better than ++nothing. 
** ++ + ### xsave + > `= <boolean>` + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 3cf18f95b7..a1bda5e12d 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1945,12 +1945,15 @@ static void paravirt_ctxt_switch_from(struct vcpu *v) + + static void paravirt_ctxt_switch_to(struct vcpu *v) + { ++ root_pgentry_t *root_pgt = this_cpu(root_pgt); + unsigned long cr4; + + switch_kernel_stack(v); + +- this_cpu(root_pgt)[root_table_offset(PERDOMAIN_VIRT_START)] = +- l4e_from_page(v->domain->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW); ++ if ( root_pgt ) ++ root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] = ++ l4e_from_page(v->domain->arch.perdomain_l3_pg, ++ __PAGE_HYPERVISOR_RW); + + cr4 = pv_guest_cr4_to_real_cr4(v); + if ( unlikely(cr4 != read_cr4()) ) +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index c9e4003989..07015e3160 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -4007,7 +4007,7 @@ long do_mmu_update( + rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, v); + if ( !rc ) +- sync_guest = 1; ++ sync_guest = !!this_cpu(root_pgt); + break; + case PGT_writable_page: + perfc_incr(writable_mmu_updates); +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index eaeec5acf0..f2f47f612a 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -320,7 +320,7 @@ void start_secondary(void *unused) + spin_debug_disable(); + + get_cpu_info()->xen_cr3 = 0; +- get_cpu_info()->pv_cr3 = __pa(this_cpu(root_pgt)); ++ get_cpu_info()->pv_cr3 = this_cpu(root_pgt) ? 
__pa(this_cpu(root_pgt)) : 0; + + load_system_tables(); + +@@ -729,14 +729,20 @@ static int clone_mapping(const void *ptr, root_pgentry_t *rpt) + return 0; + } + ++static __read_mostly int8_t opt_xpti = -1; ++boolean_param("xpti", opt_xpti); + DEFINE_PER_CPU(root_pgentry_t *, root_pgt); + + static int setup_cpu_root_pgt(unsigned int cpu) + { +- root_pgentry_t *rpt = alloc_xen_pagetable(); ++ root_pgentry_t *rpt; + unsigned int off; + int rc; + ++ if ( !opt_xpti ) ++ return 0; ++ ++ rpt = alloc_xen_pagetable(); + if ( !rpt ) + return -ENOMEM; + +@@ -977,10 +983,14 @@ void __init smp_prepare_cpus(unsigned int max_cpus) + + stack_base[0] = stack_start; + ++ if ( opt_xpti < 0 ) ++ opt_xpti = boot_cpu_data.x86_vendor != X86_VENDOR_AMD; ++ + rc = setup_cpu_root_pgt(0); + if ( rc ) + panic("Error %d setting up PV root page table\n", rc); +- get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0)); ++ if ( per_cpu(root_pgt, 0) ) ++ get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0)); + + set_nr_sockets(); + +@@ -1048,6 +1058,7 @@ void __init smp_prepare_boot_cpu(void) + cpumask_set_cpu(smp_processor_id(), &cpu_present_map); + + get_cpu_info()->xen_cr3 = 0; ++ get_cpu_info()->pv_cr3 = 0; + } + + static void +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index d63e734bb3..2a569952e3 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -45,6 +45,7 @@ restore_all_guest: + movabs $DIRECTMAP_VIRT_START, %rcx + mov %rdi, %rax + and %rsi, %rdi ++ jz .Lrag_keep_cr3 + and %r9, %rsi + add %rcx, %rdi + add %rcx, %rsi +@@ -61,6 +62,7 @@ restore_all_guest: + rep movsq + mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx) + write_cr3 rax, rdi, rsi ++.Lrag_keep_cr3: + + RESTORE_ALL + testw $TRAP_syscall,4(%rsp) +-- +2.15.1 + diff --git a/emulators/xen-kernel/files/xsa246-4.7.patch b/emulators/xen-kernel/files/xsa246-4.7.patch new file mode 100644 index 000000000000..bb58d6e7c840 --- /dev/null +++ b/emulators/xen-kernel/files/xsa246-4.7.patch @@ 
-0,0 +1,74 @@ +From: Julien Grall <julien.grall@linaro.org> +Subject: x86/pod: prevent infinite loop when shattering large pages + +When populating pages, the PoD may need to split large ones using +p2m_set_entry and request the caller to retry (see ept_get_entry for +instance). + +p2m_set_entry may fail to shatter if it is not possible to allocate +memory for the new page table. However, the error is not propagated, +resulting in the callers retrying the PoD infinitely. + +Prevent the infinite loop by returning false when it is not possible to +shatter the large mapping. + +This is XSA-246. + +Signed-off-by: Julien Grall <julien.grall@linaro.org> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -1073,9 +1073,8 @@ p2m_pod_demand_populate(struct p2m_domai + * NOTE: In a fine-grained p2m locking scenario this operation + * may need to promote its locking from gfn->1g superpage + */ +- p2m_set_entry(p2m, gfn_aligned, _mfn(INVALID_MFN), PAGE_ORDER_2M, +- p2m_populate_on_demand, p2m->default_access); +- return 0; ++ return p2m_set_entry(p2m, gfn_aligned, _mfn(INVALID_MFN), PAGE_ORDER_2M, ++ p2m_populate_on_demand, p2m->default_access); + } + + /* Only reclaim if we're in actual need of more cache. 
*/ +@@ -1106,8 +1105,12 @@ p2m_pod_demand_populate(struct p2m_domai + + gfn_aligned = (gfn >> order) << order; + +- p2m_set_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, +- p2m->default_access); ++ if ( p2m_set_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, ++ p2m->default_access) ) ++ { ++ p2m_pod_cache_add(p2m, p, order); ++ goto out_fail; ++ } + + for( i = 0; i < (1UL << order); i++ ) + { +@@ -1152,13 +1155,18 @@ remap_and_retry: + BUG_ON(order != PAGE_ORDER_2M); + pod_unlock(p2m); + +- /* Remap this 2-meg region in singleton chunks */ +- /* NOTE: In a p2m fine-grained lock scenario this might +- * need promoting the gfn lock from gfn->2M superpage */ ++ /* ++ * Remap this 2-meg region in singleton chunks. See the comment on the ++ * 1G page splitting path above for why a single call suffices. ++ * ++ * NOTE: In a p2m fine-grained lock scenario this might ++ * need promoting the gfn lock from gfn->2M superpage. ++ */ + gfn_aligned = (gfn>>order)<<order; +- for(i=0; i<(1<<order); i++) +- p2m_set_entry(p2m, gfn_aligned + i, _mfn(INVALID_MFN), PAGE_ORDER_4K, +- p2m_populate_on_demand, p2m->default_access); ++ if ( p2m_set_entry(p2m, gfn_aligned, _mfn(INVALID_MFN), PAGE_ORDER_4K, ++ p2m_populate_on_demand, p2m->default_access) ) ++ return -1; ++ + if ( tb_init_done ) + { + struct { diff --git a/emulators/xen-kernel/files/xsa248-4.8.patch b/emulators/xen-kernel/files/xsa248-4.8.patch new file mode 100644 index 000000000000..d15297e78dff --- /dev/null +++ b/emulators/xen-kernel/files/xsa248-4.8.patch @@ -0,0 +1,162 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86/mm: don't wrongly set page ownership + +PV domains can obtain mappings of any pages owned by the correct domain, +including ones that aren't actually assigned as "normal" RAM, but used +by Xen internally. At the moment such "internal" pages marked as owned +by a guest include pages used to track logdirty bits, as well as p2m +pages and the "unpaged pagetable" for HVM guests. 
Since the PV memory +management and shadow code conflict in their use of struct page_info +fields, and since shadow code is being used for log-dirty handling for +PV domains, pages coming from the shadow pool must, for PV domains, not +have the domain set as their owner. + +While the change could be done conditionally for just the PV case in +shadow code, do it unconditionally (and for consistency also for HAP), +just to be on the safe side. + +There's one special case though for shadow code: The page table used for +running a HVM guest in unpaged mode is subject to get_page() (in +set_shadow_status()) and hence must have its owner set. + +This is XSA-248. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Tim Deegan <tim@xen.org> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -283,8 +283,7 @@ static struct page_info *hap_alloc_p2m_p + { + d->arch.paging.hap.total_pages--; + d->arch.paging.hap.p2m_pages++; +- page_set_owner(pg, d); +- pg->count_info |= 1; ++ ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); + } + else if ( !d->arch.paging.p2m_alloc_failed ) + { +@@ -299,21 +298,23 @@ static struct page_info *hap_alloc_p2m_p + + static void hap_free_p2m_page(struct domain *d, struct page_info *pg) + { ++ struct domain *owner = page_get_owner(pg); ++ + /* This is called both from the p2m code (which never holds the + * paging lock) and the log-dirty code (which always does). */ + paging_lock_recursive(d); + +- ASSERT(page_get_owner(pg) == d); +- /* Should have just the one ref we gave it in alloc_p2m_page() */ +- if ( (pg->count_info & PGC_count_mask) != 1 ) { +- HAP_ERROR("Odd p2m page %p count c=%#lx t=%"PRtype_info"\n", +- pg, pg->count_info, pg->u.inuse.type_info); ++ /* Should still have no owner and count zero. 
*/ ++ if ( owner || (pg->count_info & PGC_count_mask) ) ++ { ++ HAP_ERROR("d%d: Odd p2m page %"PRI_mfn" d=%d c=%lx t=%"PRtype_info"\n", ++ d->domain_id, mfn_x(page_to_mfn(pg)), ++ owner ? owner->domain_id : DOMID_INVALID, ++ pg->count_info, pg->u.inuse.type_info); + WARN(); ++ pg->count_info &= ~PGC_count_mask; ++ page_set_owner(pg, NULL); + } +- pg->count_info &= ~PGC_count_mask; +- /* Free should not decrement domain's total allocation, since +- * these pages were allocated without an owner. */ +- page_set_owner(pg, NULL); + d->arch.paging.hap.p2m_pages--; + d->arch.paging.hap.total_pages++; + hap_free(d, page_to_mfn(pg)); +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -1573,32 +1573,29 @@ shadow_alloc_p2m_page(struct domain *d) + pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); + d->arch.paging.shadow.p2m_pages++; + d->arch.paging.shadow.total_pages--; ++ ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); + + paging_unlock(d); + +- /* Unlike shadow pages, mark p2m pages as owned by the domain. +- * Marking the domain as the owner would normally allow the guest to +- * create mappings of these pages, but these p2m pages will never be +- * in the domain's guest-physical address space, and so that is not +- * believed to be a concern. */ +- page_set_owner(pg, d); +- pg->count_info |= 1; + return pg; + } + + static void + shadow_free_p2m_page(struct domain *d, struct page_info *pg) + { +- ASSERT(page_get_owner(pg) == d); +- /* Should have just the one ref we gave it in alloc_p2m_page() */ +- if ( (pg->count_info & PGC_count_mask) != 1 ) ++ struct domain *owner = page_get_owner(pg); ++ ++ /* Should still have no owner and count zero. */ ++ if ( owner || (pg->count_info & PGC_count_mask) ) + { +- SHADOW_ERROR("Odd p2m page count c=%#lx t=%"PRtype_info"\n", ++ SHADOW_ERROR("d%d: Odd p2m page %"PRI_mfn" d=%d c=%lx t=%"PRtype_info"\n", ++ d->domain_id, mfn_x(page_to_mfn(pg)), ++ owner ? 
owner->domain_id : DOMID_INVALID, + pg->count_info, pg->u.inuse.type_info); ++ pg->count_info &= ~PGC_count_mask; ++ page_set_owner(pg, NULL); + } +- pg->count_info &= ~PGC_count_mask; + pg->u.sh.type = SH_type_p2m_table; /* p2m code reuses type-info */ +- page_set_owner(pg, NULL); + + /* This is called both from the p2m code (which never holds the + * paging lock) and the log-dirty code (which always does). */ +@@ -3216,7 +3213,9 @@ int shadow_enable(struct domain *d, u32 + | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER + | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + unmap_domain_page(e); ++ pg->count_info = 1; + pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated; ++ page_set_owner(pg, d); + } + + paging_lock(d); +@@ -3254,7 +3253,11 @@ int shadow_enable(struct domain *d, u32 + if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) + p2m_teardown(p2m); + if ( rv != 0 && pg != NULL ) ++ { ++ pg->count_info &= ~PGC_count_mask; ++ page_set_owner(pg, NULL); + shadow_free_p2m_page(d, pg); ++ } + domain_unpause(d); + return rv; + } +@@ -3363,7 +3366,22 @@ out: + + /* Must be called outside the lock */ + if ( unpaged_pagetable ) ++ { ++ if ( page_get_owner(unpaged_pagetable) == d && ++ (unpaged_pagetable->count_info & PGC_count_mask) == 1 ) ++ { ++ unpaged_pagetable->count_info &= ~PGC_count_mask; ++ page_set_owner(unpaged_pagetable, NULL); ++ } ++ /* Complain here in cases where shadow_free_p2m_page() won't. 
*/ ++ else if ( !page_get_owner(unpaged_pagetable) && ++ !(unpaged_pagetable->count_info & PGC_count_mask) ) ++ SHADOW_ERROR("d%d: Odd unpaged pt %"PRI_mfn" c=%lx t=%"PRtype_info"\n", ++ d->domain_id, mfn_x(page_to_mfn(unpaged_pagetable)), ++ unpaged_pagetable->count_info, ++ unpaged_pagetable->u.inuse.type_info); + shadow_free_p2m_page(d, unpaged_pagetable); ++ } + } + + void shadow_final_teardown(struct domain *d) diff --git a/emulators/xen-kernel/files/xsa249.patch b/emulators/xen-kernel/files/xsa249.patch new file mode 100644 index 000000000000..ecfa4305e5bf --- /dev/null +++ b/emulators/xen-kernel/files/xsa249.patch @@ -0,0 +1,42 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86/shadow: fix refcount overflow check + +Commit c385d27079 ("x86 shadow: for multi-page shadows, explicitly track +the first page") reduced the refcount width to 25, without adjusting the +overflow check. Eliminate the disconnect by using a manifest constant. + +Interestingly, up to commit 047782fa01 ("Out-of-sync L1 shadows: OOS +snapshot") the refcount was 27 bits wide, yet the check was already +using 26. + +This is XSA-249. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Tim Deegan <tim@xen.org> +--- +v2: Simplify expression back to the style it was. + +--- a/xen/arch/x86/mm/shadow/private.h ++++ b/xen/arch/x86/mm/shadow/private.h +@@ -529,7 +529,7 @@ static inline int sh_get_ref(struct doma + x = sp->u.sh.count; + nx = x + 1; + +- if ( unlikely(nx >= 1U<<26) ) ++ if ( unlikely(nx >= (1U << PAGE_SH_REFCOUNT_WIDTH)) ) + { + SHADOW_PRINTK("shadow ref overflow, gmfn=%lx smfn=%lx\n", + __backpointer(sp), mfn_x(smfn)); +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -82,7 +82,8 @@ struct page_info + unsigned long type:5; /* What kind of shadow is this? */ + unsigned long pinned:1; /* Is the shadow pinned? */ + unsigned long head:1; /* Is this the first page of the shadow? 
*/ +- unsigned long count:25; /* Reference count */ ++#define PAGE_SH_REFCOUNT_WIDTH 25 ++ unsigned long count:PAGE_SH_REFCOUNT_WIDTH; /* Reference count */ + } sh; + + /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */ diff --git a/emulators/xen-kernel/files/xsa250.patch b/emulators/xen-kernel/files/xsa250.patch new file mode 100644 index 000000000000..26aeb33fedaf --- /dev/null +++ b/emulators/xen-kernel/files/xsa250.patch @@ -0,0 +1,67 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86/shadow: fix ref-counting error handling + +The old-Linux handling in shadow_set_l4e() mistakenly ORed together the +results of sh_get_ref() and sh_pin(). As the latter failing is not a +correctness problem, simply ignore its return value. + +In sh_set_toplevel_shadow() a failing sh_get_ref() must not be +accompanied by installing the entry, despite the domain being crashed. + +This is XSA-250. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Tim Deegan <tim@xen.org> + +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -923,7 +923,7 @@ static int shadow_set_l4e(struct domain + shadow_l4e_t new_sl4e, + mfn_t sl4mfn) + { +- int flags = 0, ok; ++ int flags = 0; + shadow_l4e_t old_sl4e; + paddr_t paddr; + ASSERT(sl4e != NULL); +@@ -938,15 +938,16 @@ static int shadow_set_l4e(struct domain + { + /* About to install a new reference */ + mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e); +- ok = sh_get_ref(d, sl3mfn, paddr); +- /* Are we pinning l3 shadows to handle wierd linux behaviour? */ +- if ( sh_type_is_pinnable(d, SH_type_l3_64_shadow) ) +- ok |= sh_pin(d, sl3mfn); +- if ( !ok ) ++ ++ if ( !sh_get_ref(d, sl3mfn, paddr) ) + { + domain_crash(d); + return SHADOW_SET_ERROR; + } ++ ++ /* Are we pinning l3 shadows to handle weird Linux behaviour? 
*/ ++ if ( sh_type_is_pinnable(d, SH_type_l3_64_shadow) ) ++ sh_pin(d, sl3mfn); + } + + /* Write the new entry */ +@@ -3965,14 +3966,15 @@ sh_set_toplevel_shadow(struct vcpu *v, + + /* Take a ref to this page: it will be released in sh_detach_old_tables() + * or the next call to set_toplevel_shadow() */ +- if ( !sh_get_ref(d, smfn, 0) ) ++ if ( sh_get_ref(d, smfn, 0) ) ++ new_entry = pagetable_from_mfn(smfn); ++ else + { + SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn)); + domain_crash(d); ++ new_entry = pagetable_null(); + } + +- new_entry = pagetable_from_mfn(smfn); +- + install_new_entry: + /* Done. Install it */ + SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n", diff --git a/emulators/xen-kernel/files/xsa251-4.8.patch b/emulators/xen-kernel/files/xsa251-4.8.patch new file mode 100644 index 000000000000..fffe54d0e10a --- /dev/null +++ b/emulators/xen-kernel/files/xsa251-4.8.patch @@ -0,0 +1,21 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86/paging: don't unconditionally BUG() on finding SHARED_M2P_ENTRY + +PV guests can fully control the values written into the P2M. + +This is XSA-251. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> + +--- a/xen/arch/x86/mm/paging.c ++++ b/xen/arch/x86/mm/paging.c +@@ -276,7 +276,7 @@ void paging_mark_pfn_dirty(struct domain + return; + + /* Shared MFNs should NEVER be marked dirty */ +- BUG_ON(SHARED_M2P(pfn)); ++ BUG_ON(paging_mode_translate(d) && SHARED_M2P(pfn)); + + /* + * Values with the MSB set denote MFNs that aren't really part of the |