diff options
39 files changed, 1335 insertions, 169 deletions
@@ -21,6 +21,7 @@ Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com> Andrew Morton <akpm@linux-foundation.org> Andrew Vasquez <andrew.vasquez@qlogic.com> Andy Adamson <andros@citi.umich.edu> +Antonio Ospite <ao2@ao2.it> <ao2@amarulasolutions.com> Archit Taneja <archit@ti.com> Arnaud Patard <arnaud.patard@rtp-net.org> Arnd Bergmann <arnd@arndb.de> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 053f613fc9a9..07e4cdf02407 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3025,7 +3025,7 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0 and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), which is the maximum number of possibly pending cpu-local interrupts. -4.90 KVM_SMI +4.96 KVM_SMI Capability: KVM_CAP_X86_SMM Architectures: x86 diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index cd822d8454c0..307237cfe728 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -27,6 +27,8 @@ $(warning LSE atomics not supported by binutils) endif KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads) KBUILD_AFLAGS += $(lseinstr) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 18ca9fb9e65f..86581f793e39 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -16,7 +16,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_KMEM=y CONFIG_CGROUP_HUGETLB=y # CONFIG_UTS_NS is not set # CONFIG_IPC_NS is not set @@ -37,15 +36,13 @@ CONFIG_ARCH_EXYNOS7=y CONFIG_ARCH_LAYERSCAPE=y CONFIG_ARCH_HISI=y CONFIG_ARCH_MEDIATEK=y +CONFIG_ARCH_QCOM=y CONFIG_ARCH_ROCKCHIP=y CONFIG_ARCH_SEATTLE=y CONFIG_ARCH_RENESAS=y CONFIG_ARCH_R8A7795=y CONFIG_ARCH_STRATIX10=y CONFIG_ARCH_TEGRA=y -CONFIG_ARCH_TEGRA_132_SOC=y -CONFIG_ARCH_TEGRA_210_SOC=y -CONFIG_ARCH_QCOM=y CONFIG_ARCH_SPRD=y CONFIG_ARCH_THUNDER=y CONFIG_ARCH_UNIPHIER=y @@ -54,14 +51,19 @@ CONFIG_ARCH_XGENE=y CONFIG_ARCH_ZYNQMP=y CONFIG_PCI=y CONFIG_PCI_MSI=y +CONFIG_PCI_IOV=y +CONFIG_PCI_RCAR_GEN2_PCIE=y CONFIG_PCI_HOST_GENERIC=y CONFIG_PCI_XGENE=y -CONFIG_SMP=y +CONFIG_PCI_LAYERSCAPE=y +CONFIG_PCI_HISI=y +CONFIG_PCIE_QCOM=y CONFIG_SCHED_MC=y CONFIG_PREEMPT=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CMA=y +CONFIG_XEN=y CONFIG_CMDLINE="console=ttyAMA0" # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_COMPAT=y @@ -100,7 +102,11 @@ CONFIG_PATA_OF_PLATFORM=y CONFIG_NETDEVICES=y CONFIG_TUN=y CONFIG_VIRTIO_NET=y +CONFIG_AMD_XGBE=y CONFIG_NET_XGENE=y +CONFIG_E1000E=y +CONFIG_IGB=y +CONFIG_IGBVF=y CONFIG_SKY2=y CONFIG_RAVB=y CONFIG_SMC91X=y @@ -117,25 +123,23 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_DW=y CONFIG_SERIAL_8250_MT6577=y CONFIG_SERIAL_8250_UNIPHIER=y +CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_AMBA_PL011=y CONFIG_SERIAL_AMBA_PL011_CONSOLE=y CONFIG_SERIAL_SAMSUNG=y -CONFIG_SERIAL_SAMSUNG_UARTS_4=y -CONFIG_SERIAL_SAMSUNG_UARTS=4 CONFIG_SERIAL_SAMSUNG_CONSOLE=y +CONFIG_SERIAL_TEGRA=y CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_NR_UARTS=11 CONFIG_SERIAL_SH_SCI_CONSOLE=y -CONFIG_SERIAL_TEGRA=y CONFIG_SERIAL_MSM=y CONFIG_SERIAL_MSM_CONSOLE=y -CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_XILINX_PS_UART=y CONFIG_SERIAL_XILINX_PS_UART_CONSOLE=y CONFIG_VIRTIO_CONSOLE=y # CONFIG_HW_RANDOM is not set -CONFIG_I2C=y CONFIG_I2C_QUP=y +CONFIG_I2C_UNIPHIER_F=y CONFIG_I2C_RCAR=y CONFIG_SPI=y CONFIG_SPI_PL022=y @@ -176,8 +180,6 @@ CONFIG_MMC_SDHCI_PLTFM=y CONFIG_MMC_SDHCI_TEGRA=y CONFIG_MMC_SPI=y CONFIG_MMC_DW=y -CONFIG_MMC_DW_IDMAC=y -CONFIG_MMC_DW_PLTFM=y CONFIG_MMC_DW_EXYNOS=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -187,28 +189,33 @@ CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_LEDS_TRIGGER_CPU=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y +CONFIG_RTC_DRV_PL031=y CONFIG_RTC_DRV_XGENE=y CONFIG_DMADEVICES=y -CONFIG_RCAR_DMAC=y CONFIG_QCOM_BAM_DMA=y CONFIG_TEGRA20_APB_DMA=y +CONFIG_RCAR_DMAC=y +CONFIG_VFIO=y +CONFIG_VFIO_PCI=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BALLOON=y CONFIG_VIRTIO_MMIO=y +CONFIG_XEN_GNTDEV=y +CONFIG_XEN_GRANT_DEV_ALLOC=y CONFIG_COMMON_CLK_CS2000_CP=y CONFIG_COMMON_CLK_QCOM=y CONFIG_MSM_GCC_8916=y CONFIG_HWSPINLOCK_QCOM=y -# CONFIG_IOMMU_SUPPORT is not set +CONFIG_ARM_SMMU=y CONFIG_QCOM_SMEM=y CONFIG_QCOM_SMD=y CONFIG_QCOM_SMD_RPM=y +CONFIG_ARCH_TEGRA_132_SOC=y +CONFIG_ARCH_TEGRA_210_SOC=y +CONFIG_HISILICON_IRQ_MBIGEN=y CONFIG_PHY_XGENE=y CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y -# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set -# CONFIG_EXT3_FS_XATTR is not set -CONFIG_EXT4_FS=y CONFIG_FANOTIFY=y CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA=y @@ -239,6 +246,7 @@ CONFIG_LOCKUP_DETECTOR=y # CONFIG_FTRACE is not set CONFIG_MEMTEST=y CONFIG_SECURITY=y +CONFIG_CRYPTO_ECHAINIV=y CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_ARM64_CRYPTO=y CONFIG_CRYPTO_SHA1_ARM64_CE=y diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 2d545d7aa80b..bf464de33f52 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -67,11 +67,11 @@ extern void __pgd_error(const char *file, int line, unsigned long val); #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) -#define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) -#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) -#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC)) -#define PROT_NORMAL_WT (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_WT)) -#define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL)) +#define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) +#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) +#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC)) +#define PROT_NORMAL_WT (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_WT)) +#define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL)) #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) @@ -81,7 +81,7 @@ extern void __pgd_error(const char *file, int line, unsigned long val); #define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_RO __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_RDONLY) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_RDONLY) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_RDONLY) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) @@ -153,6 +153,7 @@ extern struct page *empty_zero_page; #define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE)) #define pte_exec(pte) (!(pte_val(pte) & PTE_UXN)) #define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT)) +#define pte_user(pte) (!!(pte_val(pte) & PTE_USER)) #ifdef CONFIG_ARM64_HW_AFDBM #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY)) @@ -163,8 +164,6 @@ extern struct page *empty_zero_page; #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) #define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID)) -#define pte_valid_user(pte) \ - ((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID) #define pte_valid_young(pte) \ @@ -278,13 +277,13 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - if (pte_valid_user(pte)) { - if (!pte_special(pte) && pte_exec(pte)) - __sync_icache_dcache(pte, addr); + if (pte_valid(pte)) { if (pte_sw_dirty(pte) && pte_write(pte)) pte_val(pte) &= ~PTE_RDONLY; else pte_val(pte) |= PTE_RDONLY; + if (pte_user(pte) && pte_exec(pte) && !pte_special(pte)) + __sync_icache_dcache(pte, addr); } /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ffe9c2b6431b..917d98108b3f 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -514,9 +514,14 @@ CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems #endif /* EL2 debug */ + mrs x0, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx x0, x0, #8, #4 + cmp x0, #1 + b.lt 4f // Skip if no PMU present mrs x0, pmcr_el0 // Disable debug access traps ubfx x0, x0, #11, #5 // to EL2 and allow access to msr mdcr_el2, x0 // all PMU counters from EL1 +4: /* Stage-2 translation */ msr vttbr_el2, xzr diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index bc2abb8b1599..999633bd7294 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -65,6 +65,16 @@ #ifdef CONFIG_EFI /* + * Prevent the symbol aliases below from being emitted into the kallsyms + * table, by forcing them to be absolute symbols (which are conveniently + * ignored by scripts/kallsyms) rather than section relative symbols. + * The distinction is only relevant for partial linking, and only for symbols + * that are defined within a section declaration (which is not the case for + * the definitions below) so the resulting values will be identical. + */ +#define KALLSYMS_HIDE(sym) ABSOLUTE(sym) + +/* * The EFI stub has its own symbol namespace prefixed by __efistub_, to * isolate it from the kernel proper. The following symbols are legally * accessed by the stub, so provide some aliases to make them accessible. @@ -73,25 +83,25 @@ * linked at. The routines below are all implemented in assembler in a * position independent manner */ -__efistub_memcmp = __pi_memcmp; -__efistub_memchr = __pi_memchr; -__efistub_memcpy = __pi_memcpy; -__efistub_memmove = __pi_memmove; -__efistub_memset = __pi_memset; -__efistub_strlen = __pi_strlen; -__efistub_strcmp = __pi_strcmp; -__efistub_strncmp = __pi_strncmp; -__efistub___flush_dcache_area = __pi___flush_dcache_area; +__efistub_memcmp = KALLSYMS_HIDE(__pi_memcmp); +__efistub_memchr = KALLSYMS_HIDE(__pi_memchr); +__efistub_memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub_memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub_memset = KALLSYMS_HIDE(__pi_memset); +__efistub_strlen = KALLSYMS_HIDE(__pi_strlen); +__efistub_strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub_strncmp = KALLSYMS_HIDE(__pi_strncmp); +__efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); #ifdef CONFIG_KASAN -__efistub___memcpy = __pi_memcpy; -__efistub___memmove = __pi_memmove; -__efistub___memset = __pi_memset; +__efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub___memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub___memset = KALLSYMS_HIDE(__pi_memset); #endif -__efistub__text = _text; -__efistub__end = _end; -__efistub__edata = _edata; +__efistub__text = KALLSYMS_HIDE(_text); +__efistub__end = KALLSYMS_HIDE(_end); +__efistub__edata = KALLSYMS_HIDE(_edata); #endif diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 5a22a119a74c..0adbebbc2803 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -46,7 +46,7 @@ enum address_markers_idx { PCI_START_NR, PCI_END_NR, MODULES_START_NR, - MODUELS_END_NR, + MODULES_END_NR, KERNEL_SPACE_NR, }; diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cf038c7d9fa9..cab7a5be40aa 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -120,6 +120,7 @@ static void __init cpu_set_ttbr1(unsigned long ttbr1) void __init kasan_init(void) { struct memblock_region *reg; + int i; /* * We are going to perform proper setup of shadow memory. @@ -155,6 +156,14 @@ void __init kasan_init(void) pfn_to_nid(virt_to_pfn(start))); } + /* + * KAsan may reuse the contents of kasan_zero_pte directly, so we + * should make sure that it maps the zero page read-only. + */ + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(&kasan_zero_pte[i], + pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO)); + memset(kasan_zero_page, 0, PAGE_SIZE); cpu_set_ttbr1(__pa(swapper_pg_dir)); flush_tlb_all(); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 3571c7309c5e..cf6240741134 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -57,6 +57,9 @@ static int change_memory_common(unsigned long addr, int numpages, if (end < MODULES_VADDR || end >= MODULES_END) return -EINVAL; + if (!numpages) + return 0; + data.set_mask = set_mask; data.clear_mask = clear_mask; diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index 146bd99a7532..e6a30e1268a8 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -84,3 +84,15 @@ b.lo 9998b dsb \domain .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index a3d867e723b4..c164d2cb35c0 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -117,7 +117,7 @@ ENTRY(cpu_do_resume) */ ubfx x11, x11, #1, #1 msr oslar_el1, x11 - msr pmuserenr_el0, xzr // Disable PMU access from EL0 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 mov x0, x12 dsb nsh // Make sure local tlb invalidation completed isb @@ -154,7 +154,7 @@ ENTRY(__cpu_setup) msr cpacr_el1, x0 // Enable FP/ASIMD mov x0, #1 << 12 // Reset mdscr_el1 and disable msr mdscr_el1, x0 // access to the DCC from EL0 - msr pmuserenr_el0, xzr // Disable PMU access from EL0 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 /* * Memory region attributes for LPAE: * diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 271fefbbe521..9d08d8cbed1a 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -38,8 +38,7 @@ #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS -#define KVM_USER_MEM_SLOTS 32 -#define KVM_MEM_SLOTS_NUM KVM_USER_MEM_SLOTS +#define KVM_USER_MEM_SLOTS 512 #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 774a253ca4e1..9bf7031a67ff 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -377,15 +377,12 @@ no_seg_found: static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) { - struct kvmppc_vcpu_book3s *vcpu_book3s; u64 esid, esid_1t; int slb_nr; struct kvmppc_slb *slbe; dprintk("KVM MMU: slbmte(0x%llx, 0x%llx)\n", rs, rb); - vcpu_book3s = to_book3s(vcpu); - esid = GET_ESID(rb); esid_1t = GET_ESID_1T(rb); slb_nr = rb & 0xfff; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index cff207b72c46..baeddb06811d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -833,6 +833,24 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->stat.sum_exits++; + /* + * This can happen if an interrupt occurs in the last stages + * of guest entry or the first stages of guest exit (i.e. after + * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV + * and before setting it to KVM_GUEST_MODE_HOST_HV). + * That can happen due to a bug, or due to a machine check + * occurring at just the wrong time. + */ + if (vcpu->arch.shregs.msr & MSR_HV) { + printk(KERN_EMERG "KVM trap in HV mode!\n"); + printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", + vcpu->arch.trap, kvmppc_get_pc(vcpu), + vcpu->arch.shregs.msr); + kvmppc_dump_regs(vcpu); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->hw.hardware_exit_reason = vcpu->arch.trap; + return RESUME_HOST; + } run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; switch (vcpu->arch.trap) { diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 3c6badcd53ef..6ee26de9a1de 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2153,7 +2153,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ 2: rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW - rlwimi r5, r4, 1, DAWRX_WT + rlwimi r5, r4, 2, DAWRX_WT clrrdi r4, r4, 3 std r4, VCPU_DAWR(r3) std r5, VCPU_DAWRX(r3) @@ -2404,6 +2404,8 @@ machine_check_realmode: * guest as machine check causing guest to crash. */ ld r11, VCPU_MSR(r9) + rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ + bne mc_cont /* if so, exit to host */ andi. r10, r11, MSR_RI /* check for unrecoverable exception */ beq 1f /* Deliver a machine check to guest */ ld r10, VCPU_PC(r9) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6fd2405c7f4a..a3b182dcb823 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -919,21 +919,17 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = -ENXIO; break; } - vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; + val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; break; case KVM_REG_PPC_VSCR: if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { r = -ENXIO; break; } - vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); + val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); break; case KVM_REG_PPC_VRSAVE: - if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { - r = -ENXIO; - break; - } - vcpu->arch.vrsave = set_reg_val(reg->id, val); + val = get_reg_val(reg->id, vcpu->arch.vrsave); break; #endif /* CONFIG_ALTIVEC */ default: @@ -974,17 +970,21 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = -ENXIO; break; } - val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; + vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; break; case KVM_REG_PPC_VSCR: if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { r = -ENXIO; break; } - val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); + vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); break; case KVM_REG_PPC_VRSAVE: - val = get_reg_val(reg->id, vcpu->arch.vrsave); + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vrsave = set_reg_val(reg->id, val); break; #endif /* CONFIG_ALTIVEC */ default: diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 6742414dbd6f..8959ebb6d2c9 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -546,7 +546,6 @@ struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; unsigned int host_acrs[NUM_ACRS]; struct fpu host_fpregs; - struct fpu guest_fpregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 5fce52cf0e57..5ea5af3c7db7 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select SRCU + select KVM_VFIO ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index b3b553469650..d42fa38c2429 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -7,7 +7,7 @@ # as published by the Free Software Foundation. KVM := ../../../virt/kvm -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o ccflags-y := -Ivirt/kvm -Iarch/s390/kvm diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 835d60bedb54..4af21c771f9b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1423,44 +1423,18 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) return 0; } -/* - * Backs up the current FP/VX register save area on a particular - * destination. Used to switch between different register save - * areas. - */ -static inline void save_fpu_to(struct fpu *dst) -{ - dst->fpc = current->thread.fpu.fpc; - dst->regs = current->thread.fpu.regs; -} - -/* - * Switches the FP/VX register save area from which to lazy - * restore register contents. - */ -static inline void load_fpu_from(struct fpu *from) -{ - current->thread.fpu.fpc = from->fpc; - current->thread.fpu.regs = from->regs; -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Save host register state */ save_fpu_regs(); - save_fpu_to(&vcpu->arch.host_fpregs); - - if (test_kvm_facility(vcpu->kvm, 129)) { - current->thread.fpu.fpc = vcpu->run->s.regs.fpc; - /* - * Use the register save area in the SIE-control block - * for register restore and save in kvm_arch_vcpu_put() - */ - current->thread.fpu.vxrs = - (__vector128 *)&vcpu->run->s.regs.vrs; - } else - load_fpu_from(&vcpu->arch.guest_fpregs); + vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; + vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; + /* Depending on MACHINE_HAS_VX, data stored to vrs either + * has vector register or floating point register format. + */ + current->thread.fpu.regs = vcpu->run->s.regs.vrs; + current->thread.fpu.fpc = vcpu->run->s.regs.fpc; if (test_fp_ctl(current->thread.fpu.fpc)) /* User space provided an invalid FPC, let's clear it */ current->thread.fpu.fpc = 0; @@ -1476,19 +1450,13 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); gmap_disable(vcpu->arch.gmap); + /* Save guest register state */ save_fpu_regs(); + vcpu->run->s.regs.fpc = current->thread.fpu.fpc; - if (test_kvm_facility(vcpu->kvm, 129)) - /* - * kvm_arch_vcpu_load() set up the register save area to - * the &vcpu->run->s.regs.vrs and, thus, the vector registers - * are already saved. Only the floating-point control must be - * copied. - */ - vcpu->run->s.regs.fpc = current->thread.fpu.fpc; - else - save_fpu_to(&vcpu->arch.guest_fpregs); - load_fpu_from(&vcpu->arch.host_fpregs); + /* Restore host register state */ + current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; + current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); @@ -1506,8 +1474,9 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64)); vcpu->arch.sie_block->gcr[0] = 0xE0UL; vcpu->arch.sie_block->gcr[14] = 0xC2000000UL; - vcpu->arch.guest_fpregs.fpc = 0; - asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); + /* make sure the new fpc will be lazily loaded */ + save_fpu_regs(); + current->thread.fpu.fpc = 0; vcpu->arch.sie_block->gbea = 1; vcpu->arch.sie_block->pp = 0; vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; @@ -1648,17 +1617,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.local_int.wq = &vcpu->wq; vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags; - /* - * Allocate a save area for floating-point registers. If the vector - * extension is available, register contents are saved in the SIE - * control block. The allocated save area is still required in - * particular places, for example, in kvm_s390_vcpu_store_status(). - */ - vcpu->arch.guest_fpregs.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS, - GFP_KERNEL); - if (!vcpu->arch.guest_fpregs.fprs) - goto out_free_sie_block; - rc = kvm_vcpu_init(vcpu, kvm, id); if (rc) goto out_free_sie_block; @@ -1879,19 +1837,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { + /* make sure the new values will be lazily loaded */ + save_fpu_regs(); if (test_fp_ctl(fpu->fpc)) return -EINVAL; - memcpy(vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); - vcpu->arch.guest_fpregs.fpc = fpu->fpc; - save_fpu_regs(); - load_fpu_from(&vcpu->arch.guest_fpregs); + current->thread.fpu.fpc = fpu->fpc; + if (MACHINE_HAS_VX) + convert_fp_to_vx(current->thread.fpu.vxrs, (freg_t *)fpu->fprs); + else + memcpy(current->thread.fpu.fprs, &fpu->fprs, sizeof(fpu->fprs)); return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - memcpy(&fpu->fprs, vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs)); - fpu->fpc = vcpu->arch.guest_fpregs.fpc; + /* make sure we have the latest values */ + save_fpu_regs(); + if (MACHINE_HAS_VX) + convert_vx_to_fp((freg_t *)fpu->fprs, current->thread.fpu.vxrs); + else + memcpy(fpu->fprs, current->thread.fpu.fprs, sizeof(fpu->fprs)); + fpu->fpc = current->thread.fpu.fpc; return 0; } @@ -2396,6 +2362,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) { unsigned char archmode = 1; + freg_t fprs[NUM_FPRS]; unsigned int px; u64 clkcomp; int rc; @@ -2411,8 +2378,16 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) gpa = px; } else gpa -= __LC_FPREGS_SAVE_AREA; - rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, - vcpu->arch.guest_fpregs.fprs, 128); + + /* manually convert vector registers if necessary */ + if (MACHINE_HAS_VX) { + convert_vx_to_fp(fprs, current->thread.fpu.vxrs); + rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, + fprs, 128); + } else { + rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, + vcpu->run->s.regs.vrs, 128); + } rc |= write_guest_abs(vcpu, gpa + __LC_GPREGS_SAVE_AREA, vcpu->run->s.regs.gprs, 128); rc |= write_guest_abs(vcpu, gpa + __LC_PSW_SAVE_AREA, @@ -2420,7 +2395,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) rc |= write_guest_abs(vcpu, gpa + __LC_PREFIX_SAVE_AREA, &px, 4); rc |= write_guest_abs(vcpu, gpa + __LC_FP_CREG_SAVE_AREA, - &vcpu->arch.guest_fpregs.fpc, 4); + &vcpu->run->s.regs.fpc, 4); rc |= write_guest_abs(vcpu, gpa + __LC_TOD_PROGREG_SAVE_AREA, &vcpu->arch.sie_block->todpr, 4); rc |= write_guest_abs(vcpu, gpa + __LC_CPU_TIMER_SAVE_AREA, @@ -2443,19 +2418,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) * it into the save area */ save_fpu_regs(); - if (test_kvm_facility(vcpu->kvm, 129)) { - /* - * If the vector extension is available, the vector registers - * which overlaps with floating-point registers are saved in - * the SIE-control block. Hence, extract the floating-point - * registers and the FPC value and store them in the - * guest_fpregs structure. - */ - vcpu->arch.guest_fpregs.fpc = current->thread.fpu.fpc; - convert_vx_to_fp(vcpu->arch.guest_fpregs.fprs, - current->thread.fpu.vxrs); - } else - save_fpu_to(&vcpu->arch.guest_fpregs); + vcpu->run->s.regs.fpc = current->thread.fpu.fpc; save_access_regs(vcpu->run->s.regs.acrs); return kvm_s390_store_status_unloaded(vcpu, addr); diff --git a/arch/sh/include/asm/barrier.h b/arch/sh/include/asm/barrier.h index f887c6465a82..8a84e05adb2e 100644 --- a/arch/sh/include/asm/barrier.h +++ b/arch/sh/include/asm/barrier.h @@ -33,7 +33,6 @@ #endif #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) -#define smp_store_mb(var, value) __smp_store_mb(var, value) #include <asm-generic/barrier.h> diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 36205c27c4d0..f6bed86c17f9 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -545,6 +545,7 @@ err_enable_device: static void virtio_pci_remove(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct device *dev = get_device(&vp_dev->vdev.dev); unregister_virtio_device(&vp_dev->vdev); @@ -554,6 +555,7 @@ static void virtio_pci_remove(struct pci_dev *pci_dev) virtio_pci_modern_remove(vp_dev); pci_disable_device(pci_dev); + put_device(dev); } static struct pci_driver virtio_pci_driver = { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0639dcc98195..81de7123959d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -165,7 +165,6 @@ struct ftrace_ops { ftrace_func_t saved_func; int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE - int nr_trampolines; struct ftrace_ops_hash local_hash; struct ftrace_ops_hash *func_hash; struct ftrace_ops_hash old_hash; diff --git a/include/trace/events/fence.h b/include/trace/events/fence.h index 98feb1b82896..d6dfa05ba322 100644 --- a/include/trace/events/fence.h +++ b/include/trace/events/fence.h @@ -17,7 +17,7 @@ TRACE_EVENT(fence_annotate_wait_on, TP_STRUCT__entry( __string(driver, fence->ops->get_driver_name(fence)) - __string(timeline, fence->ops->get_driver_name(fence)) + __string(timeline, fence->ops->get_timeline_name(fence)) __field(unsigned int, context) __field(unsigned int, seqno) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d4024f..15a1795bbba1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ - if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { - /* - * Don't let an unprivileged task work around - * the no_new_privs restriction by creating - * a thread that sets it up, enters seccomp, - * then dies. - */ - if (task_no_new_privs(caller)) - task_set_no_new_privs(thread); - + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); - } } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87fb9801bd9e..d9293402ee68 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1751,7 +1751,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 6, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); diff --git a/security/keys/key.c b/security/keys/key.c index 07a87311055c..09ef276c4bdc 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -430,7 +430,8 @@ static int __key_instantiate_and_link(struct key *key, /* and link it into the destination keyring */ if (keyring) { - set_bit(KEY_FLAG_KEEP, &key->flags); + if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) + set_bit(KEY_FLAG_KEEP, &key->flags); __key_link(key, _edit); } diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h index 26b7926bda88..ba34f9e96efd 100644 --- a/tools/virtio/asm/barrier.h +++ b/tools/virtio/asm/barrier.h @@ -1,15 +1,19 @@ #if defined(__i386__) || defined(__x86_64__) #define barrier() asm volatile("" ::: "memory") -#define mb() __sync_synchronize() - -#define smp_mb() mb() -# define dma_rmb() barrier() -# define dma_wmb() barrier() -# define smp_rmb() barrier() -# define smp_wmb() barrier() +#define virt_mb() __sync_synchronize() +#define virt_rmb() barrier() +#define virt_wmb() barrier() +/* Atomic store should be enough, but gcc generates worse code in that case. */ +#define virt_store_mb(var, value) do { \ + typeof(var) virt_store_mb_value = (value); \ + __atomic_exchange(&(var), &virt_store_mb_value, &virt_store_mb_value, \ + __ATOMIC_SEQ_CST); \ + barrier(); \ +} while (0); /* Weak barriers should be used. If not - it's a bug */ -# define rmb() abort() -# define wmb() abort() +# define mb() abort() +# define rmb() abort() +# define wmb() abort() #else #error Please fill in barrier macros #endif diff --git a/tools/virtio/linux/compiler.h b/tools/virtio/linux/compiler.h new file mode 100644 index 000000000000..845960e1cbf2 --- /dev/null +++ b/tools/virtio/linux/compiler.h @@ -0,0 +1,9 @@ +#ifndef LINUX_COMPILER_H +#define LINUX_COMPILER_H + +#define WRITE_ONCE(var, val) \ + (*((volatile typeof(val) *)(&(var))) = (val)) + +#define READ_ONCE(var) (*((volatile typeof(val) *)(&(var)))) + +#endif diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h index 4db7d5691ba7..033849948215 100644 --- a/tools/virtio/linux/kernel.h +++ b/tools/virtio/linux/kernel.h @@ -8,6 +8,7 @@ #include <assert.h> #include <stdarg.h> +#include <linux/compiler.h> #include <linux/types.h> #include <linux/printk.h> #include <linux/bug.h> diff --git a/tools/virtio/ringtest/Makefile b/tools/virtio/ringtest/Makefile new file mode 100644 index 000000000000..feaa64ac4630 --- /dev/null +++ b/tools/virtio/ringtest/Makefile @@ -0,0 +1,22 @@ +all: + +all: ring virtio_ring_0_9 virtio_ring_poll + +CFLAGS += -Wall +CFLAGS += -pthread -O2 -ggdb +LDFLAGS += -pthread -O2 -ggdb + +main.o: main.c main.h +ring.o: ring.c main.h +virtio_ring_0_9.o: virtio_ring_0_9.c main.h +virtio_ring_poll.o: virtio_ring_poll.c virtio_ring_0_9.c main.h +ring: ring.o main.o +virtio_ring_0_9: virtio_ring_0_9.o main.o +virtio_ring_poll: virtio_ring_poll.o main.o +clean: + -rm main.o + -rm ring.o ring + -rm virtio_ring_0_9.o virtio_ring_0_9 + -rm virtio_ring_poll.o virtio_ring_poll + +.PHONY: all clean diff --git a/tools/virtio/ringtest/README b/tools/virtio/ringtest/README new file mode 100644 index 000000000000..34e94c46104f --- /dev/null +++ b/tools/virtio/ringtest/README @@ -0,0 +1,2 @@ +Partial implementation of various ring layouts, useful to tune virtio design. +Uses shared memory heavily. diff --git a/tools/virtio/ringtest/main.c b/tools/virtio/ringtest/main.c new file mode 100644 index 000000000000..3a5ff438bd62 --- /dev/null +++ b/tools/virtio/ringtest/main.c @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2016 Red Hat, Inc. + * Author: Michael S. Tsirkin <mst@redhat.com> + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Command line processing and common functions for ring benchmarking. + */ +#define _GNU_SOURCE +#include <getopt.h> +#include <pthread.h> +#include <assert.h> +#include <sched.h> +#include "main.h" +#include <sys/eventfd.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <limits.h> + +int runcycles = 10000000; +int max_outstanding = INT_MAX; +int batch = 1; + +bool do_sleep = false; +bool do_relax = false; +bool do_exit = true; + +unsigned ring_size = 256; + +static int kickfd = -1; +static int callfd = -1; + +void notify(int fd) +{ + unsigned long long v = 1; + int r; + + vmexit(); + r = write(fd, &v, sizeof v); + assert(r == sizeof v); + vmentry(); +} + +void wait_for_notify(int fd) +{ + unsigned long long v = 1; + int r; + + vmexit(); + r = read(fd, &v, sizeof v); + assert(r == sizeof v); + vmentry(); +} + +void kick(void) +{ + notify(kickfd); +} + +void wait_for_kick(void) +{ + wait_for_notify(kickfd); +} + +void call(void) +{ + notify(callfd); +} + +void wait_for_call(void) +{ + wait_for_notify(callfd); +} + +void set_affinity(const char *arg) +{ + cpu_set_t cpuset; + int ret; + pthread_t self; + long int cpu; + char *endptr; + + if (!arg) + return; + + cpu = strtol(arg, &endptr, 0); + assert(!*endptr); + + assert(cpu >= 0 || cpu < CPU_SETSIZE); + + self = pthread_self(); + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + + ret = pthread_setaffinity_np(self, sizeof(cpu_set_t), &cpuset); + assert(!ret); +} + +static void run_guest(void) +{ + int completed_before; + int completed = 0; + int started = 0; + int bufs = runcycles; + int spurious = 0; + int r; + unsigned len; + void *buf; + int tokick = batch; + + for (;;) { + if (do_sleep) + disable_call(); + completed_before = completed; + do { + if (started < bufs && + started - completed < max_outstanding) { + r = add_inbuf(0, NULL, "Hello, world!"); + if (__builtin_expect(r == 0, true)) { + ++started; + if (!--tokick) { + tokick = batch; + if (do_sleep) + kick_available(); + } + + } + } else + r = -1; + + /* Flush out completed bufs if any */ + if (get_buf(&len, &buf)) { + ++completed; + if (__builtin_expect(completed == bufs, false)) + return; + r = 0; + } + } while (r == 0); + if (completed == completed_before) + ++spurious; + assert(completed <= bufs); + assert(started <= bufs); + if (do_sleep) { + if (enable_call()) + wait_for_call(); + } else { + poll_used(); + } + } +} + +static void run_host(void) +{ + int completed_before; + int completed = 0; + int spurious = 0; + int bufs = runcycles; + unsigned len; + void *buf; + + for (;;) { + if (do_sleep) { + if (enable_kick()) + wait_for_kick(); + } else { + poll_avail(); + } + if (do_sleep) + disable_kick(); + completed_before = completed; + while (__builtin_expect(use_buf(&len, &buf), true)) { + if (do_sleep) + call_used(); + ++completed; + if (__builtin_expect(completed == bufs, false)) + return; + } + if (completed == completed_before) + ++spurious; + assert(completed <= bufs); + if (completed == bufs) + break; + } +} + +void *start_guest(void *arg) +{ + set_affinity(arg); + run_guest(); + pthread_exit(NULL); +} + +void *start_host(void *arg) +{ + set_affinity(arg); + run_host(); + pthread_exit(NULL); +} + +static const char optstring[] = ""; +static const struct option longopts[] = { + { + .name = "help", + .has_arg = no_argument, + .val = 'h', + }, + { + .name = "host-affinity", + .has_arg = required_argument, + .val = 'H', + }, + { + .name = "guest-affinity", + .has_arg = required_argument, + .val = 'G', + }, + { + .name = "ring-size", + .has_arg = required_argument, + .val = 'R', + }, + { + .name = "run-cycles", + .has_arg = required_argument, + .val = 'C', + }, + { + .name = "outstanding", + .has_arg = required_argument, + .val = 'o', + }, + { + .name = "batch", + .has_arg = required_argument, + .val = 'b', + }, + { + .name = "sleep", + .has_arg = no_argument, + .val = 's', + }, + { + .name = "relax", + .has_arg = no_argument, + .val = 'x', + }, + { + .name = "exit", + .has_arg = no_argument, + .val = 'e', + }, + { + } +}; + +static void help(void) +{ + fprintf(stderr, "Usage: <test> [--help]" + " [--host-affinity H]" + " [--guest-affinity G]" + " [--ring-size R (default: %d)]" + " [--run-cycles C (default: %d)]" + " [--batch b]" + " [--outstanding o]" + " [--sleep]" + " [--relax]" + " [--exit]" + "\n", + ring_size, + runcycles); +} + +int main(int argc, char **argv) +{ + int ret; + pthread_t host, guest; + void *tret; + char *host_arg = NULL; + char *guest_arg = NULL; + char *endptr; + long int c; + + kickfd = eventfd(0, 0); + assert(kickfd >= 0); + callfd = eventfd(0, 0); + assert(callfd >= 0); + + for (;;) { + int o = getopt_long(argc, argv, optstring, longopts, NULL); + switch (o) { + case -1: + goto done; + case '?': + help(); + exit(2); + case 'H': + host_arg = optarg; + break; + case 'G': + guest_arg = optarg; + break; + case 'R': + ring_size = strtol(optarg, &endptr, 0); + assert(ring_size && !(ring_size & (ring_size - 1))); + assert(!*endptr); + break; + case 'C': + c = strtol(optarg, &endptr, 0); + assert(!*endptr); + assert(c > 0 && c < INT_MAX); + runcycles = c; + break; + case 'o': + c = strtol(optarg, &endptr, 0); + assert(!*endptr); + assert(c > 0 && c < INT_MAX); + max_outstanding = c; + break; + case 'b': + c = strtol(optarg, &endptr, 0); + assert(!*endptr); + assert(c > 0 && c < INT_MAX); + batch = c; + break; + case 's': + do_sleep = true; + break; + case 'x': + do_relax = true; + break; + case 'e': + do_exit = true; + break; + default: + help(); + exit(4); + break; + } + } + + /* does nothing here, used to make sure all smp APIs compile */ + smp_acquire(); + smp_release(); + smp_mb(); +done: + + if (batch > max_outstanding) + batch = max_outstanding; + + if (optind < argc) { + help(); + exit(4); + } + alloc_ring(); + + ret = pthread_create(&host, NULL, start_host, host_arg); + assert(!ret); + ret = pthread_create(&guest, NULL, start_guest, guest_arg); + assert(!ret); + + ret = pthread_join(guest, &tret); + assert(!ret); + ret = pthread_join(host, &tret); + assert(!ret); + return 0; +} diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h new file mode 100644 index 000000000000..16917acb0ade --- /dev/null +++ b/tools/virtio/ringtest/main.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2016 Red Hat, Inc. + * Author: Michael S. Tsirkin <mst@redhat.com> + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Common macros and functions for ring benchmarking. + */ +#ifndef MAIN_H +#define MAIN_H + +#include <stdbool.h> + +extern bool do_exit; + +#if defined(__x86_64__) || defined(__i386__) +#include "x86intrin.h" + +static inline void wait_cycles(unsigned long long cycles) +{ + unsigned long long t; + + t = __rdtsc(); + while (__rdtsc() - t < cycles) {} +} + +#define VMEXIT_CYCLES 500 +#define VMENTRY_CYCLES 500 + +#else +static inline void wait_cycles(unsigned long long cycles) +{ + _Exit(5); +} +#define VMEXIT_CYCLES 0 +#define VMENTRY_CYCLES 0 +#endif + +static inline void vmexit(void) +{ + if (!do_exit) + return; + + wait_cycles(VMEXIT_CYCLES); +} +static inline void vmentry(void) +{ + if (!do_exit) + return; + + wait_cycles(VMENTRY_CYCLES); +} + +/* implemented by ring */ +void alloc_ring(void); +/* guest side */ +int add_inbuf(unsigned, void *, void *); +void *get_buf(unsigned *, void **); +void disable_call(); +bool enable_call(); +void kick_available(); +void poll_used(); +/* host side */ +void disable_kick(); +bool enable_kick(); +bool use_buf(unsigned *, void **); +void call_used(); +void poll_avail(); + +/* implemented by main */ +extern bool do_sleep; +void kick(void); +void wait_for_kick(void); +void call(void); +void wait_for_call(void); + +extern unsigned ring_size; + +/* Compiler barrier - similar to what Linux uses */ +#define barrier() asm volatile("" ::: "memory") + +/* Is there a portable way to do this? */ +#if defined(__x86_64__) || defined(__i386__) +#define cpu_relax() asm ("rep; nop" ::: "memory") +#else +#define cpu_relax() assert(0) +#endif + +extern bool do_relax; + +static inline void busy_wait(void) +{ + if (do_relax) + cpu_relax(); + else + /* prevent compiler from removing busy loops */ + barrier(); +} + +/* + * Not using __ATOMIC_SEQ_CST since gcc docs say they are only synchronized + * with other __ATOMIC_SEQ_CST calls. + */ +#define smp_mb() __sync_synchronize() + +/* + * This abuses the atomic builtins for thread fences, and + * adds a compiler barrier. + */ +#define smp_release() do { \ + barrier(); \ + __atomic_thread_fence(__ATOMIC_RELEASE); \ +} while (0) + +#define smp_acquire() do { \ + __atomic_thread_fence(__ATOMIC_ACQUIRE); \ + barrier(); \ +} while (0) + +#endif diff --git a/tools/virtio/ringtest/ring.c b/tools/virtio/ringtest/ring.c new file mode 100644 index 000000000000..c25c8d248b6b --- /dev/null +++ b/tools/virtio/ringtest/ring.c @@ -0,0 +1,272 @@ +/* + * Copyright (C) 2016 Red Hat, Inc. + * Author: Michael S. Tsirkin <mst@redhat.com> + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Simple descriptor-based ring. virtio 0.9 compatible event index is used for + * signalling, unconditionally. + */ +#define _GNU_SOURCE +#include "main.h" +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +/* Next - Where next entry will be written. + * Prev - "Next" value when event triggered previously. + * Event - Peer requested event after writing this entry. + */ +static inline bool need_event(unsigned short event, + unsigned short next, + unsigned short prev) +{ + return (unsigned short)(next - event - 1) < (unsigned short)(next - prev); +} + +/* Design: + * Guest adds descriptors with unique index values and DESC_HW in flags. + * Host overwrites used descriptors with correct len, index, and DESC_HW clear. + * Flags are always set last. + */ +#define DESC_HW 0x1 + +struct desc { + unsigned short flags; + unsigned short index; + unsigned len; + unsigned long long addr; +}; + +/* how much padding is needed to avoid false cache sharing */ +#define HOST_GUEST_PADDING 0x80 + +/* Mostly read */ +struct event { + unsigned short kick_index; + unsigned char reserved0[HOST_GUEST_PADDING - 2]; + unsigned short call_index; + unsigned char reserved1[HOST_GUEST_PADDING - 2]; +}; + +struct data { + void *buf; /* descriptor is writeable, we can't get buf from there */ + void *data; +} *data; + +struct desc *ring; +struct event *event; + +struct guest { + unsigned avail_idx; + unsigned last_used_idx; + unsigned num_free; + unsigned kicked_avail_idx; + unsigned char reserved[HOST_GUEST_PADDING - 12]; +} guest; + +struct host { + /* we do not need to track last avail index + * unless we have more than one in flight. + */ + unsigned used_idx; + unsigned called_used_idx; + unsigned char reserved[HOST_GUEST_PADDING - 4]; +} host; + +/* implemented by ring */ +void alloc_ring(void) +{ + int ret; + int i; + + ret = posix_memalign((void **)&ring, 0x1000, ring_size * sizeof *ring); + if (ret) { + perror("Unable to allocate ring buffer.\n"); + exit(3); + } + event = malloc(sizeof *event); + if (!event) { + perror("Unable to allocate event buffer.\n"); + exit(3); + } + memset(event, 0, sizeof *event); + guest.avail_idx = 0; + guest.kicked_avail_idx = -1; + guest.last_used_idx = 0; + host.used_idx = 0; + host.called_used_idx = -1; + for (i = 0; i < ring_size; ++i) { + struct desc desc = { + .index = i, + }; + ring[i] = desc; + } + guest.num_free = ring_size; + data = malloc(ring_size * sizeof *data); + if (!data) { + perror("Unable to allocate data buffer.\n"); + exit(3); + } + memset(data, 0, ring_size * sizeof *data); +} + +/* guest side */ +int add_inbuf(unsigned len, void *buf, void *datap) +{ + unsigned head, index; + + if (!guest.num_free) + return -1; + + guest.num_free--; + head = (ring_size - 1) & (guest.avail_idx++); + + /* Start with a write. On MESI architectures this helps + * avoid a shared state with consumer that is polling this descriptor. + */ + ring[head].addr = (unsigned long)(void*)buf; + ring[head].len = len; + /* read below might bypass write above. That is OK because it's just an + * optimization. If this happens, we will get the cache line in a + * shared state which is unfortunate, but probably not worth it to + * add an explicit full barrier to avoid this. + */ + barrier(); + index = ring[head].index; + data[index].buf = buf; + data[index].data = datap; + /* Barrier A (for pairing) */ + smp_release(); + ring[head].flags = DESC_HW; + + return 0; +} + +void *get_buf(unsigned *lenp, void **bufp) +{ + unsigned head = (ring_size - 1) & guest.last_used_idx; + unsigned index; + void *datap; + + if (ring[head].flags & DESC_HW) + return NULL; + /* Barrier B (for pairing) */ + smp_acquire(); + *lenp = ring[head].len; + index = ring[head].index & (ring_size - 1); + datap = data[index].data; + *bufp = data[index].buf; + data[index].buf = NULL; + data[index].data = NULL; + guest.num_free++; + guest.last_used_idx++; + return datap; +} + +void poll_used(void) +{ + unsigned head = (ring_size - 1) & guest.last_used_idx; + + while (ring[head].flags & DESC_HW) + busy_wait(); +} + +void disable_call() +{ + /* Doing nothing to disable calls might cause + * extra interrupts, but reduces the number of cache misses. + */ +} + +bool enable_call() +{ + unsigned head = (ring_size - 1) & guest.last_used_idx; + + event->call_index = guest.last_used_idx; + /* Flush call index write */ + /* Barrier D (for pairing) */ + smp_mb(); + return ring[head].flags & DESC_HW; +} + +void kick_available(void) +{ + /* Flush in previous flags write */ + /* Barrier C (for pairing) */ + smp_mb(); + if (!need_event(event->kick_index, + guest.avail_idx, + guest.kicked_avail_idx)) + return; + + guest.kicked_avail_idx = guest.avail_idx; + kick(); +} + +/* host side */ +void disable_kick() +{ + /* Doing nothing to disable kicks might cause + * extra interrupts, but reduces the number of cache misses. + */ +} + +bool enable_kick() +{ + unsigned head = (ring_size - 1) & host.used_idx; + + event->kick_index = host.used_idx; + /* Barrier C (for pairing) */ + smp_mb(); + return !(ring[head].flags & DESC_HW); +} + +void poll_avail(void) +{ + unsigned head = (ring_size - 1) & host.used_idx; + + while (!(ring[head].flags & DESC_HW)) + busy_wait(); +} + +bool use_buf(unsigned *lenp, void **bufp) +{ + unsigned head = (ring_size - 1) & host.used_idx; + + if (!(ring[head].flags & DESC_HW)) + return false; + + /* make sure length read below is not speculated */ + /* Barrier A (for pairing) */ + smp_acquire(); + + /* simple in-order completion: we don't need + * to touch index at all. This also means we + * can just modify the descriptor in-place. + */ + ring[head].len--; + /* Make sure len is valid before flags. + * Note: alternative is to write len and flags in one access - + * possible on 64 bit architectures but wmb is free on Intel anyway + * so I have no way to test whether it's a gain. + */ + /* Barrier B (for pairing) */ + smp_release(); + ring[head].flags = 0; + host.used_idx++; + return true; +} + +void call_used(void) +{ + /* Flush in previous flags write */ + /* Barrier D (for pairing) */ + smp_mb(); + if (!need_event(event->call_index, + host.used_idx, + host.called_used_idx)) + return; + + host.called_used_idx = host.used_idx; + call(); +} diff --git a/tools/virtio/ringtest/run-on-all.sh b/tools/virtio/ringtest/run-on-all.sh new file mode 100755 index 000000000000..52b0f71ffa8d --- /dev/null +++ b/tools/virtio/ringtest/run-on-all.sh @@ -0,0 +1,24 @@ +#!/bin/sh + +#use last CPU for host. Why not the first? +#many devices tend to use cpu0 by default so +#it tends to be busier +HOST_AFFINITY=$(cd /dev/cpu; ls|grep -v '[a-z]'|sort -n|tail -1) + +#run command on all cpus +for cpu in $(cd /dev/cpu; ls|grep -v '[a-z]'|sort -n); +do + #Don't run guest and host on same CPU + #It actually works ok if using signalling + if + (echo "$@" | grep -e "--sleep" > /dev/null) || \ + test $HOST_AFFINITY '!=' $cpu + then + echo "GUEST AFFINITY $cpu" + "$@" --host-affinity $HOST_AFFINITY --guest-affinity $cpu + fi +done +echo "NO GUEST AFFINITY" +"$@" --host-affinity $HOST_AFFINITY +echo "NO AFFINITY" +"$@" diff --git a/tools/virtio/ringtest/virtio_ring_0_9.c b/tools/virtio/ringtest/virtio_ring_0_9.c new file mode 100644 index 000000000000..47c9a1a18d36 --- /dev/null +++ b/tools/virtio/ringtest/virtio_ring_0_9.c @@ -0,0 +1,316 @@ +/* + * Copyright (C) 2016 Red Hat, Inc. + * Author: Michael S. Tsirkin <mst@redhat.com> + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Partial implementation of virtio 0.9. event index is used for signalling, + * unconditionally. Design roughly follows linux kernel implementation in order + * to be able to judge its performance. + */ +#define _GNU_SOURCE +#include "main.h" +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <string.h> +#include <linux/virtio_ring.h> + +struct data { + void *data; +} *data; + +struct vring ring; + +/* enabling the below activates experimental ring polling code + * (which skips index reads on consumer in favor of looking at + * high bits of ring id ^ 0x8000). + */ +/* #ifdef RING_POLL */ + +/* how much padding is needed to avoid false cache sharing */ +#define HOST_GUEST_PADDING 0x80 + +struct guest { + unsigned short avail_idx; + unsigned short last_used_idx; + unsigned short num_free; + unsigned short kicked_avail_idx; + unsigned short free_head; + unsigned char reserved[HOST_GUEST_PADDING - 10]; +} guest; + +struct host { + /* we do not need to track last avail index + * unless we have more than one in flight. + */ + unsigned short used_idx; + unsigned short called_used_idx; + unsigned char reserved[HOST_GUEST_PADDING - 4]; +} host; + +/* implemented by ring */ +void alloc_ring(void) +{ + int ret; + int i; + void *p; + + ret = posix_memalign(&p, 0x1000, vring_size(ring_size, 0x1000)); + if (ret) { + perror("Unable to allocate ring buffer.\n"); + exit(3); + } + memset(p, 0, vring_size(ring_size, 0x1000)); + vring_init(&ring, ring_size, p, 0x1000); + + guest.avail_idx = 0; + guest.kicked_avail_idx = -1; + guest.last_used_idx = 0; + /* Put everything in free lists. */ + guest.free_head = 0; + for (i = 0; i < ring_size - 1; i++) + ring.desc[i].next = i + 1; + host.used_idx = 0; + host.called_used_idx = -1; + guest.num_free = ring_size; + data = malloc(ring_size * sizeof *data); + if (!data) { + perror("Unable to allocate data buffer.\n"); + exit(3); + } + memset(data, 0, ring_size * sizeof *data); +} + +/* guest side */ +int add_inbuf(unsigned len, void *buf, void *datap) +{ + unsigned head, avail; + struct vring_desc *desc; + + if (!guest.num_free) + return -1; + + head = guest.free_head; + guest.num_free--; + + desc = ring.desc; + desc[head].flags = VRING_DESC_F_NEXT; + desc[head].addr = (unsigned long)(void *)buf; + desc[head].len = len; + /* We do it like this to simulate the way + * we'd have to flip it if we had multiple + * descriptors. + */ + desc[head].flags &= ~VRING_DESC_F_NEXT; + guest.free_head = desc[head].next; + + data[head].data = datap; + +#ifdef RING_POLL + /* Barrier A (for pairing) */ + smp_release(); + avail = guest.avail_idx++; + ring.avail->ring[avail & (ring_size - 1)] = + (head | (avail & ~(ring_size - 1))) ^ 0x8000; +#else + avail = (ring_size - 1) & (guest.avail_idx++); + ring.avail->ring[avail] = head; + /* Barrier A (for pairing) */ + smp_release(); +#endif + ring.avail->idx = guest.avail_idx; + return 0; +} + +void *get_buf(unsigned *lenp, void **bufp) +{ + unsigned head; + unsigned index; + void *datap; + +#ifdef RING_POLL + head = (ring_size - 1) & guest.last_used_idx; + index = ring.used->ring[head].id; + if ((index ^ guest.last_used_idx ^ 0x8000) & ~(ring_size - 1)) + return NULL; + /* Barrier B (for pairing) */ + smp_acquire(); + index &= ring_size - 1; +#else + if (ring.used->idx == guest.last_used_idx) + return NULL; + /* Barrier B (for pairing) */ + smp_acquire(); + head = (ring_size - 1) & guest.last_used_idx; + index = ring.used->ring[head].id; +#endif + *lenp = ring.used->ring[head].len; + datap = data[index].data; + *bufp = (void*)(unsigned long)ring.desc[index].addr; + data[index].data = NULL; + ring.desc[index].next = guest.free_head; + guest.free_head = index; + guest.num_free++; + guest.last_used_idx++; + return datap; +} + +void poll_used(void) +{ +#ifdef RING_POLL + unsigned head = (ring_size - 1) & guest.last_used_idx; + + for (;;) { + unsigned index = ring.used->ring[head].id; + + if ((index ^ guest.last_used_idx ^ 0x8000) & ~(ring_size - 1)) + busy_wait(); + else + break; + } +#else + unsigned head = guest.last_used_idx; + + while (ring.used->idx == head) + busy_wait(); +#endif +} + +void disable_call() +{ + /* Doing nothing to disable calls might cause + * extra interrupts, but reduces the number of cache misses. + */ +} + +bool enable_call() +{ + unsigned short last_used_idx; + + vring_used_event(&ring) = (last_used_idx = guest.last_used_idx); + /* Flush call index write */ + /* Barrier D (for pairing) */ + smp_mb(); +#ifdef RING_POLL + { + unsigned short head = last_used_idx & (ring_size - 1); + unsigned index = ring.used->ring[head].id; + + return (index ^ last_used_idx ^ 0x8000) & ~(ring_size - 1); + } +#else + return ring.used->idx == last_used_idx; +#endif +} + +void kick_available(void) +{ + /* Flush in previous flags write */ + /* Barrier C (for pairing) */ + smp_mb(); + if (!vring_need_event(vring_avail_event(&ring), + guest.avail_idx, + guest.kicked_avail_idx)) + return; + + guest.kicked_avail_idx = guest.avail_idx; + kick(); +} + +/* host side */ +void disable_kick() +{ + /* Doing nothing to disable kicks might cause + * extra interrupts, but reduces the number of cache misses. + */ +} + +bool enable_kick() +{ + unsigned head = host.used_idx; + + vring_avail_event(&ring) = head; + /* Barrier C (for pairing) */ + smp_mb(); +#ifdef RING_POLL + { + unsigned index = ring.avail->ring[head & (ring_size - 1)]; + + return (index ^ head ^ 0x8000) & ~(ring_size - 1); + } +#else + return head == ring.avail->idx; +#endif +} + +void poll_avail(void) +{ + unsigned head = host.used_idx; +#ifdef RING_POLL + for (;;) { + unsigned index = ring.avail->ring[head & (ring_size - 1)]; + if ((index ^ head ^ 0x8000) & ~(ring_size - 1)) + busy_wait(); + else + break; + } +#else + while (ring.avail->idx == head) + busy_wait(); +#endif +} + +bool use_buf(unsigned *lenp, void **bufp) +{ + unsigned used_idx = host.used_idx; + struct vring_desc *desc; + unsigned head; + +#ifdef RING_POLL + head = ring.avail->ring[used_idx & (ring_size - 1)]; + if ((used_idx ^ head ^ 0x8000) & ~(ring_size - 1)) + return false; + /* Barrier A (for pairing) */ + smp_acquire(); + + used_idx &= ring_size - 1; + desc = &ring.desc[head & (ring_size - 1)]; +#else + if (used_idx == ring.avail->idx) + return false; + + /* Barrier A (for pairing) */ + smp_acquire(); + + used_idx &= ring_size - 1; + head = ring.avail->ring[used_idx]; + desc = &ring.desc[head]; +#endif + + *lenp = desc->len; + *bufp = (void *)(unsigned long)desc->addr; + + /* now update used ring */ + ring.used->ring[used_idx].id = head; + ring.used->ring[used_idx].len = desc->len - 1; + /* Barrier B (for pairing) */ + smp_release(); + host.used_idx++; + ring.used->idx = host.used_idx; + + return true; +} + +void call_used(void) +{ + /* Flush in previous flags write */ + /* Barrier D (for pairing) */ + smp_mb(); + if (!vring_need_event(vring_used_event(&ring), + host.used_idx, + host.called_used_idx)) + return; + + host.called_used_idx = host.used_idx; + call(); +} diff --git a/tools/virtio/ringtest/virtio_ring_poll.c b/tools/virtio/ringtest/virtio_ring_poll.c new file mode 100644 index 000000000000..84fc2c557aaa --- /dev/null +++ b/tools/virtio/ringtest/virtio_ring_poll.c @@ -0,0 +1,2 @@ +#define RING_POLL 1 +#include "virtio_ring_0_9.c" |