-rw-r--r--   Documentation/i386/IO-APIC.txt              |  2
-rw-r--r--   MAINTAINERS                                 |  2
-rw-r--r--   arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 39
-rw-r--r--   arch/x86/kernel/cpu/mtrr/generic.c          | 42
-rw-r--r--   arch/x86/kernel/io_delay.c                  |  8
-rw-r--r--   arch/x86/kernel/mfgpt_32.c                  |  1
-rw-r--r--   arch/x86/kernel/setup_32.c                  |  9
-rw-r--r--   arch/x86/kernel/setup_64.c                  |  2
-rw-r--r--   arch/x86/mm/discontig_32.c                  |  1
-rw-r--r--   arch/x86/mm/ioremap.c                       |  6
-rw-r--r--   arch/x86/mm/pageattr.c                      |  2
-rw-r--r--   drivers/acpi/processor_idle.c               |  7
-rw-r--r--   drivers/acpi/sbshc.c                        |  1
-rw-r--r--   drivers/acpi/scan.c                         | 18
-rw-r--r--   drivers/cpuidle/cpuidle.c                   |  4
-rw-r--r--   drivers/cpuidle/sysfs.c                     | 10
-rw-r--r--   drivers/pci/setup-bus.c                     |  5
-rw-r--r--   include/asm-x86/pgtable.h                   |  2
-rw-r--r--   include/linux/cpuidle.h                     |  4
-rw-r--r--   include/linux/sched.h                       |  6
-rw-r--r--   kernel/relay.c                              |  7
-rw-r--r--   kernel/sched.c                              | 43
-rw-r--r--   kernel/time/clocksource.c                   |  2
-rw-r--r--   kernel/timer.c                              | 10
-rw-r--r--   mm/hugetlb.c                                | 17
-rw-r--r--   mm/slab.c                                   |  4
-rw-r--r--   mm/slub.c                                   |  2
-rw-r--r--   net/sunrpc/xprtrdma/svc_rdma_recvfrom.c     | 21
28 files changed, 195 insertions, 82 deletions
diff --git a/Documentation/i386/IO-APIC.txt b/Documentation/i386/IO-APIC.txt
index f95166645d29..30b4c714fbe1 100644
--- a/Documentation/i386/IO-APIC.txt
+++ b/Documentation/i386/IO-APIC.txt
@@ -70,7 +70,7 @@ Every PCI card emits a PCI IRQ, which can be INTA, INTB, INTC or INTD:
 
 These INTA-D PCI IRQs are always 'local to the card', their real meaning
 depends on which slot they are in. If you look at the daisy chaining diagram,
-a card in slot4, issuing INTA IRQ, it will end up as a signal on PIRQ2 of
+a card in slot4, issuing INTA IRQ, it will end up as a signal on PIRQ4 of
 the PCI chipset. Most cards issue INTA, this creates optimal distribution
 between the PIRQ lines. (distributing IRQ sources properly is not a
 necessity, PCI IRQs can be shared at will, but it's a good for performance
diff --git a/MAINTAINERS b/MAINTAINERS
index 73883b8bbd76..2f70e5c10ae4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2322,6 +2322,8 @@ P:	Anil S Keshavamurthy
 M:	anil.s.keshavamurthy@intel.com
 P:	David S. Miller
 M:	davem@davemloft.net
+P:	Masami Hiramatsu
+M:	mhiramat@redhat.com
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index f2b5a621d27b..8a85c93bd62a 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -63,7 +63,7 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
  */
 static int speedstep_smi_ownership (void)
 {
-	u32 command, result, magic;
+	u32 command, result, magic, dummy;
 	u32 function = GET_SPEEDSTEP_OWNER;
 	unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
 
@@ -73,8 +73,11 @@ static int speedstep_smi_ownership (void)
 	dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port);
 
 	__asm__ __volatile__(
+		"push %%ebp\n"
 		"out %%al, (%%dx)\n"
-		: "=D" (result)
+		"pop %%ebp\n"
+		: "=D" (result), "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
+			"=S" (dummy)
 		: "a" (command), "b" (function), "c" (0), "d" (smi_port), "D" (0), "S" (magic)
 		: "memory"
 	);
@@ -96,7 +99,7 @@ static int speedstep_smi_ownership (void)
  */
 static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
 {
-	u32 command, result = 0, edi, high_mhz, low_mhz;
+	u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
 	u32 state=0;
 	u32 function = GET_SPEEDSTEP_FREQS;
 
@@ -109,10 +112,12 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
 
 	dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port);
 
-	__asm__ __volatile__("movl $0, %%edi\n"
+	__asm__ __volatile__(
+		"push %%ebp\n"
 		"out %%al, (%%dx)\n"
-		: "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi)
-		: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
+		"pop %%ebp"
+		: "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi), "=S" (dummy)
+		: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0)
 	);
 
 	dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz);
@@ -135,16 +140,18 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
 static int speedstep_get_state (void)
 {
 	u32 function=GET_SPEEDSTEP_STATE;
-	u32 result, state, edi, command;
+	u32 result, state, edi, command, dummy;
 
 	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
 
 	dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port);
 
-	__asm__ __volatile__("movl $0, %%edi\n"
+	__asm__ __volatile__(
+		"push %%ebp\n"
 		"out %%al, (%%dx)\n"
-		: "=a" (result), "=b" (state), "=D" (edi)
-		: "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0)
+		"pop %%ebp\n"
+		: "=a" (result), "=b" (state), "=D" (edi), "=c" (dummy), "=d" (dummy), "=S" (dummy)
+		: "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0), "D" (0)
 	);
 
 	dprintk("state is %x, result is %x\n", state, result);
@@ -160,7 +167,7 @@ static int speedstep_get_state (void)
  */
 static void speedstep_set_state (unsigned int state)
 {
-	unsigned int result = 0, command, new_state;
+	unsigned int result = 0, command, new_state, dummy;
 	unsigned long flags;
 	unsigned int function=SET_SPEEDSTEP_STATE;
 	unsigned int retry = 0;
@@ -182,10 +189,12 @@ static void speedstep_set_state (unsigned int state)
 		}
 		retry++;
 		__asm__ __volatile__(
-			"movl $0, %%edi\n"
+			"push %%ebp\n"
 			"out %%al, (%%dx)\n"
-			: "=b" (new_state), "=D" (result)
-			: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
+			"pop %%ebp"
+			: "=b" (new_state), "=D" (result), "=c" (dummy), "=a" (dummy),
+				"=d" (dummy), "=S" (dummy)
+			: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0)
 		);
 	} while ((new_state != state) && (retry <= SMI_TRIES));
 
@@ -195,7 +204,7 @@ static void speedstep_set_state (unsigned int state)
 	if (new_state == state) {
 		dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result);
 	} else {
-		printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result);
+		printk(KERN_ERR "cpufreq: change to state %u failed with new_state %u and result %u\n", state, new_state, result);
 	}
 
 	return;
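Note: the speedstep-smi hunks above all follow one pattern. The magic "out" can drop the CPU into SMM, and the SMI handler may return with general-purpose registers trashed; GCC will not accept %ebp in the clobber list while it serves as the frame pointer, so the patch saves and restores it by hand and routes every other affected register through a throwaway output so GCC reloads it. A minimal standalone sketch of that pattern, with an invented wrapper name and operand layout (not part of the patch):

	/*
	 * Sketch only: save %ebp manually around a call that may enter SMM,
	 * and declare all other touched registers as (dummy) outputs.
	 */
	static inline unsigned int smi_call_sketch(unsigned int command,
						   unsigned int function,
						   unsigned int port)
	{
		unsigned int result, dummy;

		__asm__ __volatile__(
			"push %%ebp\n"
			"out %%al, (%%dx)\n"	/* may trap into SMM here */
			"pop %%ebp\n"
			: "=D" (result), "=a" (dummy), "=b" (dummy),
			  "=c" (dummy), "=d" (dummy), "=S" (dummy)
			: "a" (command), "b" (function), "c" (0),
			  "d" (port), "D" (0), "S" (0)
			: "memory");

		return result;
	}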
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 103d61a59b19..3e18db4cefee 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -176,12 +176,13 @@ static inline void k8_enable_fixed_iorrs(void)
 }
 
 /**
- * Checks and updates an fixed-range MTRR if it differs from the value it
- * should have. If K8 extentions are wanted, update the K8 SYSCFG MSR also.
- * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information
- * \param msr MSR address of the MTTR which should be checked and updated
- * \param changed pointer which indicates whether the MTRR needed to be changed
- * \param msrwords pointer to the MSR values which the MSR should have
+ * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
+ * @msr: MSR address of the MTTR which should be checked and updated
+ * @changed: pointer which indicates whether the MTRR needed to be changed
+ * @msrwords: pointer to the MSR values which the MSR should have
+ *
+ * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
+ * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
  */
 static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 {
@@ -199,12 +200,15 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 	}
 }
 
+/**
+ * generic_get_free_region - Get a free MTRR.
+ * @base: The starting (base) address of the region.
+ * @size: The size (in bytes) of the region.
+ * @replace_reg: mtrr index to be replaced; set to invalid value if none.
+ *
+ * Returns: The index of the region on success, else negative on error.
+ */
 int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
-/*  [SUMMARY] Get a free MTRR.
-    <base> The starting (base) address of the region.
-    <size> The size (in bytes) of the region.
-    [RETURNS] The index of the region on success, else -1 on error.
-*/
 {
 	int i, max;
 	mtrr_type ltype;
@@ -249,8 +253,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 }
 
 /**
- * Checks and updates the fixed-range MTRRs if they differ from the saved set
- * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges()
+ * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set
+ * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
  */
 static int set_fixed_ranges(mtrr_type * frs)
 {
@@ -294,13 +298,13 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
 
 static u32 deftype_lo, deftype_hi;
 
+/**
+ * set_mtrr_state - Set the MTRR state for this CPU.
+ *
+ * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * RETURNS: 0 if no changes made, else a mask indicating what was changed.
+ */
 static unsigned long set_mtrr_state(void)
-/*  [SUMMARY] Set the MTRR state for this CPU.
-    <state> The MTRR state information to read.
-    <ctxt> Some relevant CPU context.
-    [NOTE] The CPU must already be in a safe state for MTRR changes.
-    [RETURNS] 0 if no changes made, else a mask indication what was changed.
-*/
 {
 	unsigned int i;
 	unsigned long change_mask = 0;
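Note: the comment rewrites above convert free-form \param/[SUMMARY] blocks into kernel-doc, the format that scripts/kernel-doc can extract into generated documentation. The shape, sketched on a hypothetical helper (my_helper and its parameters are invented here):

	/**
	 * my_helper - one-line summary of what the function does
	 * @arg:   meaning of the first parameter
	 * @flags: meaning of the second parameter
	 *
	 * Longer free-form discussion goes here, after a blank comment line.
	 *
	 * Returns: 0 on success, else a negative error code.
	 */
	static int my_helper(int arg, unsigned int flags)
	{
		return 0;
	}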
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index c706a3061553..5921e5f0a640 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -78,6 +78,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
+		.ident		= "HP Pavilion dv6000",
+		.matches	= {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME, "30B8")
+		}
+	},
+	{
+		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "HP Pavilion tx1000",
 		.matches	= {
 			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 027fc067b399..b402c0f3f192 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -30,6 +30,7 @@
 
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/module.h>
 #include <asm/geode.h>
 
 static struct mfgpt_timer_t {
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index a1d7071a51c9..2b3e5d45176b 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -406,8 +406,6 @@ static unsigned long __init setup_memory(void)
 	 */
 	min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	find_max_pfn();
-
 	max_low_pfn = find_max_low_pfn();
 
 #ifdef CONFIG_HIGHMEM
@@ -764,12 +762,13 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled)
 		efi_init();
 
-	max_low_pfn = setup_memory();
-
 	/* update e820 for memory not covered by WB MTRRs */
+	find_max_pfn();
 	mtrr_bp_init();
 	if (mtrr_trim_uncached_memory(max_pfn))
-		max_low_pfn = setup_memory();
+		find_max_pfn();
+
+	max_low_pfn = setup_memory();
 
 #ifdef CONFIG_VMI
 	/*
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 7637dc91c79b..f4f7ecfb898c 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -801,7 +801,7 @@ static void __cpuinit srat_detect_node(void)
 		/* Don't do the funky fallback heuristics the AMD version employs
 		   for now. */
 		node = apicid_to_node[apicid];
-		if (node == NUMA_NO_NODE)
+		if (node == NUMA_NO_NODE || !node_online(node))
 			node = first_node(node_online_map);
 		numa_set_node(cpu, node);
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index c394ca0720b8..8e25e06ff730 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -324,7 +324,6 @@ unsigned long __init setup_memory(void)
 	 * this space and use it to adjust the boundary between ZONE_NORMAL
 	 * and ZONE_HIGHMEM.
 	 */
-	find_max_pfn();
 	get_memcfg_numa();
 
 	kva_pages = calculate_numa_remap_pages();
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 4afaba0ed722..794895c6dcc9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -137,7 +137,11 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
 	switch (mode) {
 	case IOR_MODE_UNCACHED:
 	default:
-		prot = PAGE_KERNEL_NOCACHE;
+		/*
+		 * FIXME: we will use UC MINUS for now, as video fb drivers
+		 * depend on it. Upcoming ioremap_wc() will fix this behavior.
+		 */
+		prot = PAGE_KERNEL_UC_MINUS;
 		break;
 	case IOR_MODE_CACHED:
 		prot = PAGE_KERNEL;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 14e48b5a94ba..7b79f6be4e7d 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -771,7 +771,7 @@ static inline int change_page_attr_clear(unsigned long addr, int numpages,
 int set_memory_uc(unsigned long addr, int numpages)
 {
 	return change_page_attr_set(addr, numpages,
-				    __pgprot(_PAGE_PCD | _PAGE_PWT));
+				    __pgprot(_PAGE_PCD));
 }
 EXPORT_SYMBOL(set_memory_uc);
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index e8e2d8869236..788da9781f80 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -1487,7 +1486,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 		return 0;
 	}
 
-	acpi_unlazy_tlb(smp_processor_id());
 	/*
 	 * Must be done before busmaster disable as we might need to
 	 * access HPET !
@@ -1577,6 +1576,8 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 		return 0;
 	}
 
+	acpi_unlazy_tlb(smp_processor_id());
+
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
 	/*
@@ -1692,7 +1693,9 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 		switch (cx->type) {
 			case ACPI_STATE_C1:
 			state->flags |= CPUIDLE_FLAG_SHALLOW;
-			state->flags |= CPUIDLE_FLAG_TIME_VALID;
+			if (cx->entry_method == ACPI_CSTATE_FFH)
+				state->flags |= CPUIDLE_FLAG_TIME_VALID;
+
 			state->enter = acpi_idle_enter_c1;
 			dev->safe_state = state;
 			break;
diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c
index a2cf3008ce6c..bcf2c70fca87 100644
--- a/drivers/acpi/sbshc.c
+++ b/drivers/acpi/sbshc.c
@@ -130,7 +130,6 @@ static int acpi_smbus_transaction(struct acpi_smb_hc *hc, u8 protocol,
 		goto end;
 	}
 	smb_hc_write(hc, ACPI_SMB_COMMAND, command);
-	smb_hc_write(hc, ACPI_SMB_COMMAND, command);
 	if (!(protocol & 0x01)) {
 		smb_hc_write(hc, ACPI_SMB_BLOCK_COUNT, length);
 		for (i = 0; i < length; ++i)
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 57570ac47803..e6ce262b5d44 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -39,20 +39,26 @@ static int create_modalias(struct acpi_device *acpi_dev, char *modalias,
 			   int size)
 {
 	int len;
+	int count;
 
-	if (!acpi_dev->flags.hardware_id)
+	if (!acpi_dev->flags.hardware_id && !acpi_dev->flags.compatible_ids)
 		return -ENODEV;
 
-	len = snprintf(modalias, size, "acpi:%s:",
-		       acpi_dev->pnp.hardware_id);
-	if (len < 0 || len >= size)
-		return -EINVAL;
+	len = snprintf(modalias, size, "acpi:");
 	size -= len;
 
+	if (acpi_dev->flags.hardware_id) {
+		count = snprintf(&modalias[len], size, "%s:",
+				 acpi_dev->pnp.hardware_id);
+		if (count < 0 || count >= size)
+			return -EINVAL;
+		len += count;
+		size -= count;
+	}
+
 	if (acpi_dev->flags.compatible_ids) {
 		struct acpi_compatible_id_list *cid_list;
 		int i;
-		int count;
 
 		cid_list = acpi_dev->pnp.cid_list;
 		for (i = 0; i < cid_list->count; i++) {
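Note: after the scan.c rewrite, a device with only compatible IDs still gets a modalias; the string is "acpi:" followed by each ID and a trailing colon. A small userspace mirror of the concatenation logic, with invented IDs and error handling elided (a sketch, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		char modalias[64];
		const char *hid = "PNP0C0A";		/* hardware ID (may be absent) */
		const char *cids[] = { "ACPI0003" };	/* compatible IDs */
		int len, count, size = sizeof(modalias);
		size_t i;

		len = snprintf(modalias, size, "acpi:");
		size -= len;
		if (hid) {
			count = snprintf(&modalias[len], size, "%s:", hid);
			len += count;
			size -= count;
		}
		for (i = 0; i < sizeof(cids) / sizeof(cids[0]); i++) {
			count = snprintf(&modalias[len], size, "%s:", cids[i]);
			len += count;
			size -= count;
		}
		printf("%s\n", modalias);	/* prints: acpi:PNP0C0A:ACPI0003: */
		return 0;
	}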
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index d73663a52324..fc555a90bb21 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -67,7 +67,7 @@ static void cpuidle_idle_call(void)
 	/* enter the state and update stats */
 	dev->last_residency = target_state->enter(dev, target_state);
 	dev->last_state = target_state;
-	target_state->time += dev->last_residency;
+	target_state->time += (unsigned long long)dev->last_residency;
 	target_state->usage++;
 
 	/* give the governor an opportunity to reflect on the outcome */
@@ -224,7 +224,7 @@ static void poll_idle_init(struct cpuidle_device *dev)
 	state->exit_latency = 0;
 	state->target_residency = 0;
 	state->power_usage = -1;
-	state->flags = CPUIDLE_FLAG_POLL | CPUIDLE_FLAG_TIME_VALID;
+	state->flags = CPUIDLE_FLAG_POLL;
 	state->enter = poll_idle;
 }
 #else
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 69102ca05685..e949618b9be0 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -218,6 +218,12 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
 	return sprintf(buf, "%u\n", state->_name);\
 }
 
+#define define_show_state_ull_function(_name) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
+{ \
+	return sprintf(buf, "%llu\n", state->_name);\
+}
+
 #define define_show_state_str_function(_name) \
 static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
 { \
@@ -228,8 +234,8 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
 
 define_show_state_function(exit_latency)
 define_show_state_function(power_usage)
-define_show_state_function(usage)
-define_show_state_function(time)
+define_show_state_ull_function(usage)
+define_show_state_ull_function(time)
 define_show_state_str_function(name)
 define_show_state_str_function(desc)
 
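Note: the new macro is the existing %u show-function generator with the format bumped to %llu, matching the counters that become unsigned long long in include/linux/cpuidle.h below. For instance, define_show_state_ull_function(time) expands (whitespace aside) to:

	static ssize_t show_state_time(struct cpuidle_state *state, char *buf)
	{
		return sprintf(buf, "%llu\n", state->time);
	}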
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 125e7b7f34ff..f7cb8e0758b4 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -486,12 +486,7 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 		break;
 
 	case PCI_CLASS_BRIDGE_PCI:
-		/* don't size subtractive decoding (transparent)
-		 * PCI-to-PCI bridges */
-		if (bus->self->transparent)
-			break;
 		pci_bridge_check_ranges(bus);
-		/* fall through */
 	default:
 		pbus_size_io(bus);
 		/* If the bridge supports prefetchable range, size it
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 174b87738714..9cf472aeb9ce 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -85,6 +85,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
 #define __PAGE_KERNEL_RX		(__PAGE_KERNEL_EXEC & ~_PAGE_RW)
 #define __PAGE_KERNEL_EXEC_NOCACHE	(__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
 #define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS		(__PAGE_KERNEL | _PAGE_PCD)
 #define __PAGE_KERNEL_VSYSCALL		(__PAGE_KERNEL_RX | _PAGE_USER)
 #define __PAGE_KERNEL_VSYSCALL_NOCACHE	(__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
 #define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
@@ -101,6 +102,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
 #define PAGE_KERNEL_EXEC		MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
 #define PAGE_KERNEL_RX			MAKE_GLOBAL(__PAGE_KERNEL_RX)
 #define PAGE_KERNEL_NOCACHE		MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS		MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
 #define PAGE_KERNEL_EXEC_NOCACHE	MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
 #define PAGE_KERNEL_LARGE		MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
 #define PAGE_KERNEL_LARGE_EXEC		MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
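Note: with the power-on default PAT layout (assumed here), the _PAGE_PWT/_PAGE_PCD PTE bits index the PAT entries, so both bits set gives strong UC while PCD alone gives UC-: uncached, but still overridable to write-combining by an MTRR. That override is what the ioremap()/set_memory_uc() hunks above rely on to keep video fb drivers working. Illustrative only; the MY_PAGE_* names are invented:

	#define MY_PAGE_WB		0				/* PAT index 0: write-back */
	#define MY_PAGE_WT		(_PAGE_PWT)			/* PAT index 1: write-through */
	#define MY_PAGE_UC_MINUS	(_PAGE_PCD)			/* PAT index 2: UC-, MTRR may override */
	#define MY_PAGE_UC		(_PAGE_PCD | _PAGE_PWT)		/* PAT index 3: strong UC */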
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 6b72a4584086..51e6b1e520e6 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -38,8 +38,8 @@ struct cpuidle_state {
 	unsigned int	power_usage; /* in mW */
 	unsigned int	target_residency; /* in US */
 
-	unsigned int	usage;
-	unsigned int	time; /* in US */
+	unsigned long long	usage;
+	unsigned long long	time; /* in US */
 
 	int (*enter)	(struct cpuidle_device *dev,
 			 struct cpuidle_state *state);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fed07d03364e..6a1e7afb099b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1541,6 +1541,12 @@ static inline void idle_task_exit(void) {}
 
 extern void sched_idle_next(void);
 
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+extern void wake_up_idle_cpu(int cpu);
+#else
+static inline void wake_up_idle_cpu(int cpu) { }
+#endif
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
diff --git a/kernel/relay.c b/kernel/relay.c
index 4c035a8a248c..d6204a485818 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp)
 	kref_get(&buf->kref);
 	filp->private_data = buf;
 
-	return 0;
+	return nonseekable_open(inode, filp);
 }
 
 /**
@@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = {
 	.get = generic_pipe_buf_get,
 };
 
+static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+}
+
 /*
  *	subbuf_splice_actor - splice up to one subbuf's worth of data
  */
@@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in,
 		.partial = partial,
 		.flags = flags,
 		.ops = &relay_pipe_buf_ops,
+		.spd_release = relay_page_release,
 	};
 
 	if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
diff --git a/kernel/sched.c b/kernel/sched.c
index 28c73f07efb2..8dcdec6fe0fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1052,6 +1052,49 @@ static void resched_cpu(int cpu)
 	resched_task(cpu_curr(cpu));
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
+
+#ifdef CONFIG_NO_HZ
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (cpu == smp_processor_id())
+		return;
+
+	/*
+	 * This is safe, as this function is called with the timer
+	 * wheel base lock of (cpu) held. When the CPU is on the way
+	 * to idle and has not yet set rq->curr to idle then it will
+	 * be serialized on the timer wheel base lock and take the new
+	 * timer into account automatically.
+	 */
+	if (rq->curr != rq->idle)
+		return;
+
+	/*
+	 * We can set TIF_RESCHED on the idle task of the other CPU
+	 * lockless. The worst case is that the other CPU runs the
+	 * idle task through an additional NOOP schedule()
+	 */
+	set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(rq->idle))
+		smp_send_reschedule(cpu);
+}
+#endif
+
 #else
 static void __resched_task(struct task_struct *p, int tif_bit)
 {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 278534bbca95..7f60097d443a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -174,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 		if (watchdog)
 			del_timer(&watchdog_timer);
 		watchdog = cs;
-		init_timer_deferrable(&watchdog_timer);
+		init_timer(&watchdog_timer);
 		watchdog_timer.function = clocksource_watchdog;
 
 		/* Reset watchdog cycles */
diff --git a/kernel/timer.c b/kernel/timer.c
index 99b00a25f88b..b024106daa70 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu)
 	spin_lock_irqsave(&base->lock, flags);
 	timer_set_base(timer, base);
 	internal_add_timer(base, timer);
+	/*
+	 * Check whether the other CPU is idle and needs to be
+	 * triggered to reevaluate the timer wheel when nohz is
+	 * active. We are protected against the other CPU fiddling
+	 * with the timer by holding the timer base lock. This also
+	 * makes sure that a CPU on the way to idle can not evaluate
+	 * the timer wheel.
+	 */
+	wake_up_idle_cpu(cpu);
 	spin_unlock_irqrestore(&base->lock, flags);
 }
 
-
 /**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
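Note: after the sched.c and timer.c hunks above, callers of add_timer_on() need no changes; the wakeup of a nohz-idle target CPU happens inside the timer base lock. A hypothetical caller using the 2.6.25-era timer API (my_timer/my_timer_fn are invented for illustration):

	static struct timer_list my_timer;

	static void my_timer_fn(unsigned long data)
	{
		printk(KERN_INFO "timer fired on CPU %d\n", smp_processor_id());
	}

	static void arm_timer_on(int cpu)
	{
		setup_timer(&my_timer, my_timer_fn, 0);
		my_timer.expires = jiffies + HZ / 10;	/* ~100ms from now */
		add_timer_on(&my_timer, cpu);	/* now also kicks an idle nohz CPU */
	}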
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 74c1b6b0b37b..51c9e2c01640 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -401,12 +401,20 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 	struct page *page;
 	unsigned long nr_pages;
 
+	/*
+	 * We want to release as many surplus pages as possible, spread
+	 * evenly across all nodes. Iterate across all nodes until we
+	 * can no longer free unreserved surplus pages. This occurs when
+	 * the nodes with surplus pages have no free pages.
+	 */
+	unsigned long remaining_iterations = num_online_nodes();
+
 	/* Uncommit the reservation */
 	resv_huge_pages -= unused_resv_pages;
 
 	nr_pages = min(unused_resv_pages, surplus_huge_pages);
 
-	while (nr_pages) {
+	while (remaining_iterations-- && nr_pages) {
 		nid = next_node(nid, node_online_map);
 		if (nid == MAX_NUMNODES)
 			nid = first_node(node_online_map);
@@ -424,6 +432,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 			surplus_huge_pages--;
 			surplus_huge_pages_node[nid]--;
 			nr_pages--;
+			remaining_iterations = num_online_nodes();
 		}
 	}
 }
@@ -671,9 +680,11 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 {
 	return sprintf(buf,
 		"Node %d HugePages_Total: %5u\n"
-		"Node %d HugePages_Free:  %5u\n",
+		"Node %d HugePages_Free:  %5u\n"
+		"Node %d HugePages_Surp:  %5u\n",
 		nid, nr_huge_pages_node[nid],
-		nid, free_huge_pages_node[nid]);
+		nid, free_huge_pages_node[nid],
+		nid, surplus_huge_pages_node[nid]);
 }
 
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
diff --git a/mm/slab.c b/mm/slab.c
index bb4070e1079f..04b308c3bc54 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1481,7 +1481,7 @@ void __init kmem_cache_init(void)
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
-	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
+	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
 
 	/*
 	 * struct kmem_cache size depends on nr_node_ids, which
@@ -1602,7 +1602,7 @@ void __init kmem_cache_init(void)
 		int nid;
 
 		for_each_online_node(nid) {
-			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], nid);
+			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
 
 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
 				  &initkmem_list3[SIZE_AC + nid], nid);
diff --git a/mm/slub.c b/mm/slub.c
index ca71d5b81e4a..b72bc98e2dc1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2685,6 +2685,7 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);
 
+#if defined(SLUB_DEBUG) || defined(CONFIG_SLABINFO)
 static unsigned long count_partial(struct kmem_cache_node *n)
 {
 	unsigned long flags;
@@ -2697,6 +2698,7 @@ static unsigned long count_partial(struct kmem_cache_node *n)
 	spin_unlock_irqrestore(&n->list_lock, flags);
 	return x;
 }
+#endif
 
 /*
  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 971271602dd0..c22d6b6f2db4 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -322,15 +322,6 @@ next_sge:
 		ctxt->direction = DMA_FROM_DEVICE;
 		clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
 		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
-		if ((ch+1)->rc_discrim == 0) {
-			/*
-			 * Checked in sq_cq_reap to see if we need to
-			 * be enqueued
-			 */
-			set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
-			ctxt->next = hdr_ctxt;
-			hdr_ctxt->next = head;
-		}
 
 		/* Prepare READ WR */
 		memset(&read_wr, 0, sizeof read_wr);
@@ -348,7 +339,17 @@ next_sge:
 		rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
 				  &sgl_offset,
 				  read_wr.num_sge);
-
+		if (((ch+1)->rc_discrim == 0) &&
+		    (read_wr.num_sge == ch_sge_ary[ch_no].count)) {
+			/*
+			 * Mark the last RDMA_READ with a bit to
+			 * indicate all RPC data has been fetched from
+			 * the client and the RPC needs to be enqueued.
+			 */
+			set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+			ctxt->next = hdr_ctxt;
+			hdr_ctxt->next = head;
+		}
 		/* Post the read */
 		err = svc_rdma_send(xprt, &read_wr);
 		if (err) {