diff options
78 files changed, 2271 insertions, 1290 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd index bcd88eb7ebcd..3c17b62899f6 100644 --- a/Documentation/ABI/testing/sysfs-bus-rbd +++ b/Documentation/ABI/testing/sysfs-bus-rbd @@ -35,8 +35,14 @@ name pool - The pool where this rbd image resides. The pool-name pair is unique - per rados system. + The name of the storage pool where this rbd image resides. + An rbd image name is unique within its pool. + +pool_id + + The unique identifier for the rbd image's pool. This is + a permanent attribute of the pool. A pool's id will never + change. size diff --git a/MAINTAINERS b/MAINTAINERS index fb036a062a5d..5b44872b64ec 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1789,15 +1789,16 @@ F: arch/powerpc/oprofile/*cell* F: arch/powerpc/platforms/cell/ CEPH DISTRIBUTED FILE SYSTEM CLIENT -M: Sage Weil <sage@newdream.net> +M: Sage Weil <sage@inktank.com> L: ceph-devel@vger.kernel.org -W: http://ceph.newdream.net/ +W: http://ceph.com/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported F: Documentation/filesystems/ceph.txt F: fs/ceph F: net/ceph F: include/linux/ceph +F: include/linux/crush CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: L: linux-usb@vger.kernel.org @@ -5639,10 +5640,12 @@ S: Supported F: arch/hexagon/ RADOS BLOCK DEVICE (RBD) -F: include/linux/qnxtypes.h -M: Yehuda Sadeh <yehuda@hq.newdream.net> -M: Sage Weil <sage@newdream.net> +M: Yehuda Sadeh <yehuda@inktank.com> +M: Sage Weil <sage@inktank.com> +M: Alex Elder <elder@inktank.com> M: ceph-devel@vger.kernel.org +W: http://ceph.com/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported F: drivers/block/rbd.c F: drivers/block/rbd_types.h diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 296cd32466df..76de6b68487c 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -90,6 +90,7 @@ config S390 select HAVE_MEMBLOCK_NODE_MAP select HAVE_CMPXCHG_LOCAL select ARCH_DISCARD_MEMBLOCK + select BUILDTIME_EXTABLE_SORT select ARCH_INLINE_SPIN_TRYLOCK select ARCH_INLINE_SPIN_TRYLOCK_BH select ARCH_INLINE_SPIN_LOCK diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 37d2bf267964..967923dea98d 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -7,6 +7,9 @@ CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_RCU_FAST_NO_HZ=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y @@ -35,8 +38,6 @@ CONFIG_MODVERSIONS=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y CONFIG_DEFAULT_DEADLINE=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y CONFIG_PREEMPT=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 5c63615f1349..b749c5733657 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -11,7 +11,6 @@ #include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/ctl_reg.h> -#include <asm-generic/mm_hooks.h> static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) @@ -58,7 +57,7 @@ static inline void update_mm(struct mm_struct *mm, struct task_struct *tsk) pgd_t *pgd = mm->pgd; S390_lowcore.user_asce = mm->context.asce_bits | __pa(pgd); - if (user_mode != HOME_SPACE_MODE) { + if (addressing_mode != HOME_SPACE_MODE) { /* Load primary space page table origin. */ asm volatile(LCTL_OPCODE" 1,1,%0\n" : : "m" (S390_lowcore.user_asce) ); @@ -91,4 +90,17 @@ static inline void activate_mm(struct mm_struct *prev, switch_mm(prev, next, current); } +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ +#ifdef CONFIG_64BIT + if (oldmm->context.asce_limit < mm->context.asce_limit) + crst_table_downgrade(mm, oldmm->context.asce_limit); +#endif +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + #endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index c40fa91e38a8..11e4e3236937 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -120,7 +120,9 @@ struct stack_frame { regs->psw.mask = psw_user_bits | PSW_MASK_BA; \ regs->psw.addr = new_psw | PSW_ADDR_AMODE; \ regs->gprs[15] = new_stackp; \ + __tlb_flush_mm(current->mm); \ crst_table_downgrade(current->mm, 1UL << 31); \ + update_mm(current->mm, current); \ } while (0) /* Forward declaration, a strange C thing */ diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 57e80534375a..e6859d16ee2d 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -60,7 +60,7 @@ void create_mem_hole(struct mem_chunk memory_chunk[], unsigned long addr, #define SECONDARY_SPACE_MODE 2 #define HOME_SPACE_MODE 3 -extern unsigned int user_mode; +extern unsigned int addressing_mode; /* * Machine features detected in head.S diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 21be961e8a43..ba500d8dc392 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -110,6 +110,7 @@ struct debug_view debug_raw_view = { NULL, NULL }; +EXPORT_SYMBOL(debug_raw_view); struct debug_view debug_hex_ascii_view = { "hex_ascii", @@ -119,6 +120,7 @@ struct debug_view debug_hex_ascii_view = { NULL, NULL }; +EXPORT_SYMBOL(debug_hex_ascii_view); static struct debug_view debug_level_view = { "level", @@ -155,6 +157,7 @@ struct debug_view debug_sprintf_view = { NULL, NULL }; +EXPORT_SYMBOL(debug_sprintf_view); /* used by dump analysis tools to determine version of debug feature */ static unsigned int __used debug_feature_version = __DEBUG_FEATURE_VERSION; @@ -730,6 +733,7 @@ debug_info_t *debug_register(const char *name, int pages_per_area, return debug_register_mode(name, pages_per_area, nr_areas, buf_size, S_IRUSR | S_IWUSR, 0, 0); } +EXPORT_SYMBOL(debug_register); /* * debug_unregister: @@ -748,6 +752,7 @@ debug_unregister(debug_info_t * id) out: return; } +EXPORT_SYMBOL(debug_unregister); /* * debug_set_size: @@ -810,7 +815,7 @@ debug_set_level(debug_info_t* id, int new_level) } spin_unlock_irqrestore(&id->lock,flags); } - +EXPORT_SYMBOL(debug_set_level); /* * proceed_active_entry: @@ -930,7 +935,7 @@ debug_stop_all(void) if (debug_stoppable) debug_active = 0; } - +EXPORT_SYMBOL(debug_stop_all); void debug_set_critical(void) { @@ -963,6 +968,7 @@ debug_event_common(debug_info_t * id, int level, const void *buf, int len) return active; } +EXPORT_SYMBOL(debug_event_common); /* * debug_exception_common: @@ -990,6 +996,7 @@ debug_entry_t return active; } +EXPORT_SYMBOL(debug_exception_common); /* * counts arguments in format string for sprintf view @@ -1043,6 +1050,7 @@ debug_sprintf_event(debug_info_t* id, int level,char *string,...) return active; } +EXPORT_SYMBOL(debug_sprintf_event); /* * debug_sprintf_exception: @@ -1081,25 +1089,7 @@ debug_sprintf_exception(debug_info_t* id, int level,char *string,...) return active; } - -/* - * debug_init: - * - is called exactly once to initialize the debug feature - */ - -static int -__init debug_init(void) -{ - int rc = 0; - - s390dbf_sysctl_header = register_sysctl_table(s390dbf_dir_table); - mutex_lock(&debug_mutex); - debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT,NULL); - initialized = 1; - mutex_unlock(&debug_mutex); - - return rc; -} +EXPORT_SYMBOL(debug_sprintf_exception); /* * debug_register_view: @@ -1147,6 +1137,7 @@ debug_register_view(debug_info_t * id, struct debug_view *view) out: return rc; } +EXPORT_SYMBOL(debug_register_view); /* * debug_unregister_view: @@ -1176,6 +1167,7 @@ debug_unregister_view(debug_info_t * id, struct debug_view *view) out: return rc; } +EXPORT_SYMBOL(debug_unregister_view); static inline char * debug_get_user_string(const char __user *user_buf, size_t user_len) @@ -1485,6 +1477,7 @@ debug_dflt_header_fn(debug_info_t * id, struct debug_view *view, except_str, entry->id.fields.cpuid, (void *) caller); return rc; } +EXPORT_SYMBOL(debug_dflt_header_fn); /* * prints debug data sprintf-formated: @@ -1533,33 +1526,16 @@ out: } /* - * clean up module + * debug_init: + * - is called exactly once to initialize the debug feature */ -static void __exit debug_exit(void) +static int __init debug_init(void) { - debugfs_remove(debug_debugfs_root_entry); - unregister_sysctl_table(s390dbf_sysctl_header); - return; + s390dbf_sysctl_header = register_sysctl_table(s390dbf_dir_table); + mutex_lock(&debug_mutex); + debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT, NULL); + initialized = 1; + mutex_unlock(&debug_mutex); + return 0; } - -/* - * module definitions - */ postcore_initcall(debug_init); -module_exit(debug_exit); -MODULE_LICENSE("GPL"); - -EXPORT_SYMBOL(debug_register); -EXPORT_SYMBOL(debug_unregister); -EXPORT_SYMBOL(debug_set_level); -EXPORT_SYMBOL(debug_stop_all); -EXPORT_SYMBOL(debug_register_view); -EXPORT_SYMBOL(debug_unregister_view); -EXPORT_SYMBOL(debug_event_common); -EXPORT_SYMBOL(debug_exception_common); -EXPORT_SYMBOL(debug_hex_ascii_view); -EXPORT_SYMBOL(debug_raw_view); -EXPORT_SYMBOL(debug_dflt_header_fn); -EXPORT_SYMBOL(debug_sprintf_view); -EXPORT_SYMBOL(debug_sprintf_exception); -EXPORT_SYMBOL(debug_sprintf_event); diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 1f6b428e2762..619c5d350726 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -1531,7 +1531,7 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr) void show_code(struct pt_regs *regs) { - char *mode = (regs->psw.mask & PSW_MASK_PSTATE) ? "User" : "Krnl"; + char *mode = user_mode(regs) ? "User" : "Krnl"; unsigned char code[64]; char buffer[64], *ptr; mm_segment_t old_fs; @@ -1540,7 +1540,7 @@ void show_code(struct pt_regs *regs) /* Get a snapshot of the 64 bytes surrounding the fault address. */ old_fs = get_fs(); - set_fs((regs->psw.mask & PSW_MASK_PSTATE) ? USER_DS : KERNEL_DS); + set_fs(user_mode(regs) ? USER_DS : KERNEL_DS); for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) { addr = regs->psw.addr - 34 + start; if (__copy_from_user(code + start - 2, diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index bc95a8ebd9cc..83c3271c442b 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -455,7 +455,6 @@ void __init startup_init(void) init_kernel_storage_key(); lockdep_init(); lockdep_off(); - sort_main_extable(); setup_lowcore_early(); setup_facility_list(); detect_machine_type(); diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index e64d141555ce..6ffcd3203215 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -1583,7 +1583,7 @@ static struct kset *vmcmd_kset; static void vmcmd_run(struct shutdown_trigger *trigger) { - char *cmd, *next_cmd; + char *cmd; if (strcmp(trigger->name, ON_REIPL_STR) == 0) cmd = vmcmd_on_reboot; @@ -1600,15 +1600,7 @@ static void vmcmd_run(struct shutdown_trigger *trigger) if (strlen(cmd) == 0) return; - do { - next_cmd = strchr(cmd, '\n'); - if (next_cmd) { - next_cmd[0] = 0; - next_cmd += 1; - } - __cpcmd(cmd, NULL, 0, NULL); - cmd = next_cmd; - } while (cmd != NULL); + __cpcmd(cmd, NULL, 0, NULL); } static int vmcmd_init(void) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 743c0f32fe3b..f86c81e13c37 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -302,8 +302,8 @@ static int __init parse_vmalloc(char *arg) } early_param("vmalloc", parse_vmalloc); -unsigned int user_mode = HOME_SPACE_MODE; -EXPORT_SYMBOL_GPL(user_mode); +unsigned int addressing_mode = HOME_SPACE_MODE; +EXPORT_SYMBOL_GPL(addressing_mode); static int set_amode_primary(void) { @@ -328,7 +328,7 @@ static int set_amode_primary(void) */ static int __init early_parse_switch_amode(char *p) { - user_mode = PRIMARY_SPACE_MODE; + addressing_mode = PRIMARY_SPACE_MODE; return 0; } early_param("switch_amode", early_parse_switch_amode); @@ -336,9 +336,9 @@ early_param("switch_amode", early_parse_switch_amode); static int __init early_parse_user_mode(char *p) { if (p && strcmp(p, "primary") == 0) - user_mode = PRIMARY_SPACE_MODE; + addressing_mode = PRIMARY_SPACE_MODE; else if (!p || strcmp(p, "home") == 0) - user_mode = HOME_SPACE_MODE; + addressing_mode = HOME_SPACE_MODE; else return 1; return 0; @@ -347,7 +347,7 @@ early_param("user_mode", early_parse_user_mode); static void setup_addressing_mode(void) { - if (user_mode == PRIMARY_SPACE_MODE) { + if (addressing_mode == PRIMARY_SPACE_MODE) { if (set_amode_primary()) pr_info("Address spaces switched, " "mvcos available\n"); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index af2421a0f315..01775c04a90e 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -185,7 +185,7 @@ void show_registers(struct pt_regs *regs) { char *mode; - mode = (regs->psw.mask & PSW_MASK_PSTATE) ? "User" : "Krnl"; + mode = user_mode(regs) ? "User" : "Krnl"; printk("%s PSW : %p %p", mode, (void *) regs->psw.mask, (void *) regs->psw.addr); @@ -225,7 +225,7 @@ void show_regs(struct pt_regs *regs) (void *) current->thread.ksp); show_registers(regs); /* Show stack backtrace if pt_regs is from kernel mode */ - if (!(regs->psw.mask & PSW_MASK_PSTATE)) + if (!user_mode(regs)) show_trace(NULL, (unsigned long *) regs->gprs[15]); show_last_breaking_event(regs); } @@ -300,7 +300,7 @@ static void __kprobes do_trap(struct pt_regs *regs, regs->int_code, si_signo) == NOTIFY_STOP) return; - if (regs->psw.mask & PSW_MASK_PSTATE) { + if (user_mode(regs)) { info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; @@ -341,7 +341,7 @@ void __kprobes do_per_trap(struct pt_regs *regs) static void default_trap_handler(struct pt_regs *regs) { - if (regs->psw.mask & PSW_MASK_PSTATE) { + if (user_mode(regs)) { report_user_fault(regs, SIGSEGV); do_exit(SIGSEGV); } else @@ -410,7 +410,7 @@ static void __kprobes illegal_op(struct pt_regs *regs) location = get_psw_address(regs); - if (regs->psw.mask & PSW_MASK_PSTATE) { + if (user_mode(regs)) { if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) return; if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { @@ -478,7 +478,7 @@ void specification_exception(struct pt_regs *regs) location = (__u16 __user *) get_psw_address(regs); - if (regs->psw.mask & PSW_MASK_PSTATE) { + if (user_mode(regs)) { get_user(*((__u16 *) opcode), location); switch (opcode[0]) { case 0x28: /* LDR Rx,Ry */ @@ -531,7 +531,7 @@ static void data_exception(struct pt_regs *regs) asm volatile("stfpc %0" : "=m" (current->thread.fp_regs.fpc)); #ifdef CONFIG_MATHEMU - else if (regs->psw.mask & PSW_MASK_PSTATE) { + else if (user_mode(regs)) { __u8 opcode[6]; get_user(*((__u16 *) opcode), location); switch (opcode[0]) { @@ -598,7 +598,7 @@ static void data_exception(struct pt_regs *regs) static void space_switch_exception(struct pt_regs *regs) { /* Set user psw back to home space mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) + if (user_mode(regs)) regs->psw.mask |= PSW_ASC_HOME; /* Send SIGILL. */ do_trap(regs, SIGILL, ILL_PRVOPC, "space switch event"); diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index ea5590fdca3b..9a19ca367c17 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -84,7 +84,8 @@ struct vdso_data *vdso_data = &vdso_data_store.data; */ static void vdso_init_data(struct vdso_data *vd) { - vd->ectg_available = user_mode != HOME_SPACE_MODE && test_facility(31); + vd->ectg_available = + addressing_mode != HOME_SPACE_MODE && test_facility(31); } #ifdef CONFIG_64BIT @@ -101,7 +102,7 @@ int vdso_alloc_per_cpu(struct _lowcore *lowcore) lowcore->vdso_per_cpu_data = __LC_PASTE; - if (user_mode == HOME_SPACE_MODE || !vdso_enabled) + if (addressing_mode == HOME_SPACE_MODE || !vdso_enabled) return 0; segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); @@ -146,7 +147,7 @@ void vdso_free_per_cpu(struct _lowcore *lowcore) unsigned long segment_table, page_table, page_frame; u32 *psal, *aste; - if (user_mode == HOME_SPACE_MODE || !vdso_enabled) + if (addressing_mode == HOME_SPACE_MODE || !vdso_enabled) return; psal = (u32 *)(addr_t) lowcore->paste[4]; @@ -164,7 +165,7 @@ static void vdso_init_cr5(void) { unsigned long cr5; - if (user_mode == HOME_SPACE_MODE || !vdso_enabled) + if (addressing_mode == HOME_SPACE_MODE || !vdso_enabled) return; cr5 = offsetof(struct _lowcore, paste); __ctl_load(cr5, 5, 5); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 21109c63eb12..de8fa9bbd35e 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -45,7 +45,7 @@ SECTIONS .dummy : { *(.dummy) } :data - RODATA + RO_DATA_SECTION(PAGE_SIZE) #ifdef CONFIG_SHARED_KERNEL . = ALIGN(0x100000); /* VM shared segments are 1MB aligned */ diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6a12d1bb6e09..6c013f544146 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -49,6 +49,7 @@ #define VM_FAULT_BADCONTEXT 0x010000 #define VM_FAULT_BADMAP 0x020000 #define VM_FAULT_BADACCESS 0x040000 +#define VM_FAULT_SIGNAL 0x080000 static unsigned long store_indication; @@ -110,7 +111,7 @@ static inline int user_space_fault(unsigned long trans_exc_code) if (trans_exc_code == 2) /* Access via secondary space, set_fs setting decides */ return current->thread.mm_segment.ar4; - if (user_mode == HOME_SPACE_MODE) + if (addressing_mode == HOME_SPACE_MODE) /* User space if the access has been done via home space. */ return trans_exc_code == 3; /* @@ -219,7 +220,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) case VM_FAULT_BADACCESS: case VM_FAULT_BADMAP: /* Bad memory access. Check if it is kernel or user space. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { + if (user_mode(regs)) { /* User mode accesses just cause a SIGSEGV */ si_code = (fault == VM_FAULT_BADMAP) ? SEGV_MAPERR : SEGV_ACCERR; @@ -229,15 +230,19 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) case VM_FAULT_BADCONTEXT: do_no_context(regs); break; + case VM_FAULT_SIGNAL: + if (!user_mode(regs)) + do_no_context(regs); + break; default: /* fault & VM_FAULT_ERROR */ if (fault & VM_FAULT_OOM) { - if (!(regs->psw.mask & PSW_MASK_PSTATE)) + if (!user_mode(regs)) do_no_context(regs); else pagefault_out_of_memory(); } else if (fault & VM_FAULT_SIGBUS) { /* Kernel mode? Handle exceptions or die */ - if (!(regs->psw.mask & PSW_MASK_PSTATE)) + if (!user_mode(regs)) do_no_context(regs); else do_sigbus(regs); @@ -286,7 +291,7 @@ static inline int do_exception(struct pt_regs *regs, int access) address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - flags = FAULT_FLAG_ALLOW_RETRY; + flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) flags |= FAULT_FLAG_WRITE; down_read(&mm->mmap_sem); @@ -335,6 +340,11 @@ retry: * the fault. */ fault = handle_mm_fault(mm, vma, address, flags); + /* No reason to continue if interrupted by SIGKILL. */ + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + fault = VM_FAULT_SIGNAL; + goto out; + } if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; @@ -426,7 +436,7 @@ void __kprobes do_asce_exception(struct pt_regs *regs) } /* User mode accesses just cause a SIGSEGV */ - if (regs->psw.mask & PSW_MASK_PSTATE) { + if (user_mode(regs)) { do_sigsegv(regs, SEGV_MAPERR); return; } @@ -441,6 +451,7 @@ int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) struct pt_regs regs; int access, fault; + /* Emulate a uaccess fault from kernel mode. */ regs.psw.mask = psw_kernel_bits | PSW_MASK_DAT | PSW_MASK_MCHECK; if (!irqs_disabled()) regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; @@ -450,12 +461,12 @@ int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) regs.int_parm_long = (uaddr & PAGE_MASK) | 2; access = write ? VM_WRITE : VM_READ; fault = do_exception(®s, access); - if (unlikely(fault)) { - if (fault & VM_FAULT_OOM) - return -EFAULT; - else if (fault & VM_FAULT_SIGBUS) - do_sigbus(®s); - } + /* + * Since the fault happened in kernel mode while performing a uaccess + * all we need to do now is emulating a fixup in case "fault" is not + * zero. + * For the calling uaccess functions this results always in -EFAULT. + */ return fault ? -EFAULT : 0; } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 573384256c5c..c59a5efa58b1 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -103,9 +103,15 @@ void arch_pick_mmap_layout(struct mm_struct *mm) int s390_mmap_check(unsigned long addr, unsigned long len) { + int rc; + if (!is_compat_task() && - len >= TASK_SIZE && TASK_SIZE < (1UL << 53)) - return crst_table_upgrade(current->mm, 1UL << 53); + len >= TASK_SIZE && TASK_SIZE < (1UL << 53)) { + rc = crst_table_upgrade(current->mm, 1UL << 53); + if (rc) + return rc; + update_mm(current->mm, current); + } return 0; } @@ -125,6 +131,7 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr, rc = crst_table_upgrade(mm, 1UL << 53); if (rc) return (unsigned long) rc; + update_mm(mm, current); area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); } return area; @@ -147,6 +154,7 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, rc = crst_table_upgrade(mm, 1UL << 53); if (rc) return (unsigned long) rc; + update_mm(mm, current); area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 1cab221077cc..18df31d1f2c9 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -85,7 +85,6 @@ repeat: crst_table_free(mm, table); if (mm->context.asce_limit < limit) goto repeat; - update_mm(mm, current); return 0; } @@ -93,9 +92,6 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) { pgd_t *pgd; - if (mm->context.asce_limit <= limit) - return; - __tlb_flush_mm(mm); while (mm->context.asce_limit > limit) { pgd = mm->pgd; switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { @@ -118,7 +114,6 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) mm->task_size = mm->context.asce_limit; crst_table_free(mm, (unsigned long *) pgd); } - update_mm(mm, current); } #endif @@ -801,7 +796,7 @@ int s390_enable_sie(void) struct mm_struct *mm, *old_mm; /* Do we have switched amode? If no, we cannot do sie */ - if (user_mode == HOME_SPACE_MODE) + if (addressing_mode == HOME_SPACE_MODE) return -EINVAL; /* Do we have pgstes? if yes, we are done */ diff --git a/arch/s390/oprofile/backtrace.c b/arch/s390/oprofile/backtrace.c index c82f62fb9c28..8a6811b2cdb9 100644 --- a/arch/s390/oprofile/backtrace.c +++ b/arch/s390/oprofile/backtrace.c @@ -58,7 +58,7 @@ void s390_backtrace(struct pt_regs * const regs, unsigned int depth) unsigned long head; struct stack_frame* head_sf; - if (user_mode (regs)) + if (user_mode(regs)) return; head = regs->gprs[15]; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8f428a8ab003..9917943a3572 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -55,8 +55,6 @@ #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ -#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX)) -#define RBD_MAX_POOL_NAME_LEN 64 #define RBD_MAX_SNAP_NAME_LEN 32 #define RBD_MAX_OPT_LEN 1024 @@ -78,13 +76,12 @@ */ struct rbd_image_header { u64 image_size; - char block_name[32]; + char *object_prefix; __u8 obj_order; __u8 crypt_type; __u8 comp_type; struct ceph_snap_context *snapc; size_t snap_names_len; - u64 snap_seq; u32 total_snaps; char *snap_names; @@ -150,7 +147,7 @@ struct rbd_snap { * a single device */ struct rbd_device { - int id; /* blkdev unique id */ + int dev_id; /* blkdev unique id */ int major; /* blkdev assigned major */ struct gendisk *disk; /* blkdev's gendisk and rq */ @@ -163,20 +160,24 @@ struct rbd_device { spinlock_t lock; /* queue lock */ struct rbd_image_header header; - char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ - int obj_len; - char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ - char pool_name[RBD_MAX_POOL_NAME_LEN]; - int poolid; + char *image_name; + size_t image_name_len; + char *header_name; + char *pool_name; + int pool_id; struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; /* protects updating the header */ struct rw_semaphore header_rwsem; - char snap_name[RBD_MAX_SNAP_NAME_LEN]; + /* name of the snapshot this device reads from */ + char *snap_name; + /* id of the snapshot this device reads from */ u64 snap_id; /* current snapshot id */ - int read_only; + /* whether the snap_id this device reads from still exists */ + bool snap_exists; + int read_only; struct list_head node; @@ -201,8 +202,7 @@ static ssize_t rbd_snap_add(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); -static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, - struct rbd_snap *snap); +static void __rbd_remove_snap_dev(struct rbd_snap *snap); static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); @@ -240,7 +240,7 @@ static void rbd_put_dev(struct rbd_device *rbd_dev) put_device(&rbd_dev->dev); } -static int __rbd_refresh_header(struct rbd_device *rbd_dev); +static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -273,9 +273,9 @@ static const struct block_device_operations rbd_bd_ops = { /* * Initialize an rbd client instance. - * We own *opt. + * We own *ceph_opts. */ -static struct rbd_client *rbd_client_create(struct ceph_options *opt, +static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts, struct rbd_options *rbd_opts) { struct rbd_client *rbdc; @@ -291,10 +291,10 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbdc->client = ceph_create_client(opt, rbdc, 0, 0); + rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); if (IS_ERR(rbdc->client)) goto out_mutex; - opt = NULL; /* Now rbdc->client is responsible for opt */ + ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ ret = ceph_open_session(rbdc->client); if (ret < 0) @@ -317,23 +317,23 @@ out_mutex: mutex_unlock(&ctl_mutex); kfree(rbdc); out_opt: - if (opt) - ceph_destroy_options(opt); + if (ceph_opts) + ceph_destroy_options(ceph_opts); return ERR_PTR(ret); } /* * Find a ceph client with specific addr and configuration. */ -static struct rbd_client *__rbd_client_find(struct ceph_options *opt) +static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts) { struct rbd_client *client_node; - if (opt->flags & CEPH_OPT_NOSHARE) + if (ceph_opts->flags & CEPH_OPT_NOSHARE) return NULL; list_for_each_entry(client_node, &rbd_client_list, node) - if (ceph_compare_options(opt, client_node->client) == 0) + if (!ceph_compare_options(ceph_opts, client_node->client)) return client_node; return NULL; } @@ -349,7 +349,7 @@ enum { /* string args above */ }; -static match_table_t rbdopt_tokens = { +static match_table_t rbd_opts_tokens = { {Opt_notify_timeout, "notify_timeout=%d"}, /* int args above */ /* string args above */ @@ -358,11 +358,11 @@ static match_table_t rbdopt_tokens = { static int parse_rbd_opts_token(char *c, void *private) { - struct rbd_options *rbdopt = private; + struct rbd_options *rbd_opts = private; substring_t argstr[MAX_OPT_ARGS]; int token, intval, ret; - token = match_token(c, rbdopt_tokens, argstr); + token = match_token(c, rbd_opts_tokens, argstr); if (token < 0) return -EINVAL; @@ -383,7 +383,7 @@ static int parse_rbd_opts_token(char *c, void *private) switch (token) { case Opt_notify_timeout: - rbdopt->notify_timeout = intval; + rbd_opts->notify_timeout = intval; break; default: BUG_ON(token); @@ -400,7 +400,7 @@ static struct rbd_client *rbd_get_client(const char *mon_addr, char *options) { struct rbd_client *rbdc; - struct ceph_options *opt; + struct ceph_options *ceph_opts; struct rbd_options *rbd_opts; rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); @@ -409,29 +409,29 @@ static struct rbd_client *rbd_get_client(const char *mon_addr, rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; - opt = ceph_parse_options(options, mon_addr, - mon_addr + mon_addr_len, - parse_rbd_opts_token, rbd_opts); - if (IS_ERR(opt)) { + ceph_opts = ceph_parse_options(options, mon_addr, + mon_addr + mon_addr_len, + parse_rbd_opts_token, rbd_opts); + if (IS_ERR(ceph_opts)) { kfree(rbd_opts); - return ERR_CAST(opt); + return ERR_CAST(ceph_opts); } spin_lock(&rbd_client_list_lock); - rbdc = __rbd_client_find(opt); + rbdc = __rbd_client_find(ceph_opts); if (rbdc) { /* using an existing client */ kref_get(&rbdc->kref); spin_unlock(&rbd_client_list_lock); - ceph_destroy_options(opt); + ceph_destroy_options(ceph_opts); kfree(rbd_opts); return rbdc; } spin_unlock(&rbd_client_list_lock); - rbdc = rbd_client_create(opt, rbd_opts); + rbdc = rbd_client_create(ceph_opts, rbd_opts); if (IS_ERR(rbdc)) kfree(rbd_opts); @@ -480,46 +480,60 @@ static void rbd_coll_release(struct kref *kref) kfree(coll); } +static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) +{ + return !memcmp(&ondisk->text, + RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)); +} + /* * Create a new header structure, translate header format from the on-disk * header. */ static int rbd_header_from_disk(struct rbd_image_header *header, struct rbd_image_header_ondisk *ondisk, - u32 allocated_snaps, - gfp_t gfp_flags) + u32 allocated_snaps) { - u32 i, snap_count; + u32 snap_count; - if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) + if (!rbd_dev_ondisk_valid(ondisk)) return -ENXIO; snap_count = le32_to_cpu(ondisk->snap_count); - if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context)) - / sizeof (*ondisk)) + if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) + / sizeof (u64)) return -EINVAL; header->snapc = kmalloc(sizeof(struct ceph_snap_context) + snap_count * sizeof(u64), - gfp_flags); + GFP_KERNEL); if (!header->snapc) return -ENOMEM; - header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); if (snap_count) { + header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); header->snap_names = kmalloc(header->snap_names_len, - gfp_flags); + GFP_KERNEL); if (!header->snap_names) goto err_snapc; header->snap_sizes = kmalloc(snap_count * sizeof(u64), - gfp_flags); + GFP_KERNEL); if (!header->snap_sizes) goto err_names; } else { + WARN_ON(ondisk->snap_names_len); + header->snap_names_len = 0; header->snap_names = NULL; header->snap_sizes = NULL; } - memcpy(header->block_name, ondisk->block_name, + + header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1, + GFP_KERNEL); + if (!header->object_prefix) + goto err_sizes; + + memcpy(header->object_prefix, ondisk->block_name, sizeof(ondisk->block_name)); + header->object_prefix[sizeof (ondisk->block_name)] = '\0'; header->image_size = le64_to_cpu(ondisk->image_size); header->obj_order = ondisk->options.order; @@ -527,11 +541,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->comp_type = ondisk->options.comp_type; atomic_set(&header->snapc->nref, 1); - header->snap_seq = le64_to_cpu(ondisk->snap_seq); + header->snapc->seq = le64_to_cpu(ondisk->snap_seq); header->snapc->num_snaps = snap_count; header->total_snaps = snap_count; if (snap_count && allocated_snaps == snap_count) { + int i; + for (i = 0; i < snap_count; i++) { header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); @@ -540,16 +556,22 @@ static int rbd_header_from_disk(struct rbd_image_header *header, } /* copy snapshot names */ - memcpy(header->snap_names, &ondisk->snaps[i], + memcpy(header->snap_names, &ondisk->snaps[snap_count], header->snap_names_len); } return 0; +err_sizes: + kfree(header->snap_sizes); + header->snap_sizes = NULL; err_names: kfree(header->snap_names); + header->snap_names = NULL; err_snapc: kfree(header->snapc); + header->snapc = NULL; + return -ENOMEM; } @@ -575,52 +597,50 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, return -ENOENT; } -static int rbd_header_set_snap(struct rbd_device *dev, u64 *size) +static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) { - struct rbd_image_header *header = &dev->header; - struct ceph_snap_context *snapc = header->snapc; - int ret = -ENOENT; - - BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME)); + int ret; - down_write(&dev->header_rwsem); + down_write(&rbd_dev->header_rwsem); - if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME, + if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { - if (header->total_snaps) - snapc->seq = header->snap_seq; - else - snapc->seq = 0; - dev->snap_id = CEPH_NOSNAP; - dev->read_only = 0; + rbd_dev->snap_id = CEPH_NOSNAP; + rbd_dev->snap_exists = false; + rbd_dev->read_only = 0; if (size) - *size = header->image_size; + *size = rbd_dev->header.image_size; } else { - ret = snap_by_name(header, dev->snap_name, &snapc->seq, size); + u64 snap_id = 0; + + ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name, + &snap_id, size); if (ret < 0) goto done; - dev->snap_id = snapc->seq; - dev->read_only = 1; + rbd_dev->snap_id = snap_id; + rbd_dev->snap_exists = true; + rbd_dev->read_only = 1; } ret = 0; done: - up_write(&dev->header_rwsem); + up_write(&rbd_dev->header_rwsem); return ret; } static void rbd_header_free(struct rbd_image_header *header) { - kfree(header->snapc); - kfree(header->snap_names); + kfree(header->object_prefix); kfree(header->snap_sizes); + kfree(header->snap_names); + ceph_put_snap_context(header->snapc); } /* * get the actual striped segment name, offset and length */ static u64 rbd_get_segment(struct rbd_image_header *header, - const char *block_name, + const char *object_prefix, u64 ofs, u64 len, char *seg_name, u64 *segofs) { @@ -628,7 +648,7 @@ static u64 rbd_get_segment(struct rbd_image_header *header, if (seg_name) snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, - "%s.%012llx", block_name, seg); + "%s.%012llx", object_prefix, seg); ofs = ofs & ((1 << header->obj_order) - 1); len = min_t(u64, len, (1 << header->obj_order) - ofs); @@ -726,9 +746,8 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, * split_bio will BUG_ON if this is not the case */ dout("bio_chain_clone split! total=%d remaining=%d" - "bi_size=%d\n", - (int)total, (int)len-total, - (int)old_chain->bi_size); + "bi_size=%u\n", + total, len - total, old_chain->bi_size); /* split the bio. We'll release it either in the next call, or it will have to be released outside */ @@ -777,22 +796,24 @@ err_out: /* * helpers for osd request op vectors. */ -static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, - int num_ops, - int opcode, - u32 payload_len) -{ - *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), - GFP_NOIO); - if (!*ops) - return -ENOMEM; - (*ops)[0].op = opcode; +static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, + int opcode, u32 payload_len) +{ + struct ceph_osd_req_op *ops; + + ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); + if (!ops) + return NULL; + + ops[0].op = opcode; + /* * op extent offset and length will be set later on * in calc_raw_layout() */ - (*ops)[0].payload_len = payload_len; - return 0; + ops[0].payload_len = payload_len; + + return ops; } static void rbd_destroy_ops(struct ceph_osd_req_op *ops) @@ -808,8 +829,8 @@ static void rbd_coll_end_req_index(struct request *rq, struct request_queue *q; int min, max, i; - dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n", - coll, index, ret, len); + dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", + coll, index, ret, (unsigned long long) len); if (!rq) return; @@ -848,16 +869,15 @@ static void rbd_coll_end_req(struct rbd_request *req, * Send ceph osd request */ static int rbd_do_request(struct request *rq, - struct rbd_device *dev, + struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - const char *obj, u64 ofs, u64 len, + const char *object_name, u64 ofs, u64 len, struct bio *bio, struct page **pages, int num_pages, int flags, struct ceph_osd_req_op *ops, - int num_reply, struct rbd_req_coll *coll, int coll_index, void (*rbd_cb)(struct ceph_osd_request *req, @@ -887,15 +907,13 @@ static int rbd_do_request(struct request *rq, req_data->coll_index = coll_index; } - dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); - - down_read(&dev->header_rwsem); + dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name, + (unsigned long long) ofs, (unsigned long long) len); - osdc = &dev->rbd_client->client->osdc; + osdc = &rbd_dev->rbd_client->client->osdc; req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, false, GFP_NOIO, pages, bio); if (!req) { - up_read(&dev->header_rwsem); ret = -ENOMEM; goto done_pages; } @@ -912,7 +930,7 @@ static int rbd_do_request(struct request *rq, reqhead = req->r_request->front.iov_base; reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); - strncpy(req->r_oid, obj, sizeof(req->r_oid)); + strncpy(req->r_oid, object_name, sizeof(req->r_oid)); req->r_oid_len = strlen(req->r_oid); layout = &req->r_file_layout; @@ -920,7 +938,7 @@ static int rbd_do_request(struct request *rq, layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_stripe_count = cpu_to_le32(1); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_pool = cpu_to_le32(dev->poolid); + layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, req, ops); @@ -929,7 +947,6 @@ static int rbd_do_request(struct request *rq, snapc, &mtime, req->r_oid, req->r_oid_len); - up_read(&dev->header_rwsem); if (linger_req) { ceph_osdc_set_request_linger(osdc, req); @@ -944,8 +961,9 @@ static int rbd_do_request(struct request *rq, ret = ceph_osdc_wait_request(osdc, req); if (ver) *ver = le64_to_cpu(req->r_reassert_version.version); - dout("reassert_ver=%lld\n", - le64_to_cpu(req->r_reassert_version.version)); + dout("reassert_ver=%llu\n", + (unsigned long long) + le64_to_cpu(req->r_reassert_version.version)); ceph_osdc_put_request(req); } return ret; @@ -979,7 +997,8 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) bytes = le64_to_cpu(op->extent.length); read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); - dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc); + dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", + (unsigned long long) bytes, read_op, (int) rc); if (rc == -ENOENT && read_op) { zero_bio_chain(req_data->bio, 0); @@ -1006,14 +1025,12 @@ static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg /* * Do a synchronous ceph osd operation */ -static int rbd_req_sync_op(struct rbd_device *dev, +static int rbd_req_sync_op(struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - int opcode, int flags, - struct ceph_osd_req_op *orig_ops, - int num_reply, - const char *obj, + struct ceph_osd_req_op *ops, + const char *object_name, u64 ofs, u64 len, char *buf, struct ceph_osd_request **linger_req, @@ -1022,45 +1039,28 @@ static int rbd_req_sync_op(struct rbd_device *dev, int ret; struct page **pages; int num_pages; - struct ceph_osd_req_op *ops = orig_ops; - u32 payload_len; + + BUG_ON(ops == NULL); num_pages = calc_pages_for(ofs , len); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); - if (!orig_ops) { - payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0); - ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); - if (ret < 0) - goto done; - - if ((flags & CEPH_OSD_FLAG_WRITE) && buf) { - ret = ceph_copy_to_page_vector(pages, buf, ofs, len); - if (ret < 0) - goto done_ops; - } - } - - ret = rbd_do_request(NULL, dev, snapc, snapid, - obj, ofs, len, NULL, + ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, + object_name, ofs, len, NULL, pages, num_pages, flags, ops, - 2, NULL, 0, NULL, linger_req, ver); if (ret < 0) - goto done_ops; + goto done; if ((flags & CEPH_OSD_FLAG_READ) && buf) ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); -done_ops: - if (!orig_ops) - rbd_destroy_ops(ops); done: ceph_release_page_vector(pages, num_pages); return ret; @@ -1070,10 +1070,10 @@ done: * Do an asynchronous ceph osd operation */ static int rbd_do_op(struct request *rq, - struct rbd_device *rbd_dev , + struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - int opcode, int flags, int num_reply, + int opcode, int flags, u64 ofs, u64 len, struct bio *bio, struct rbd_req_coll *coll, @@ -1091,14 +1091,15 @@ static int rbd_do_op(struct request *rq, return -ENOMEM; seg_len = rbd_get_segment(&rbd_dev->header, - rbd_dev->header.block_name, + rbd_dev->header.object_prefix, ofs, len, seg_name, &seg_ofs); payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); - ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); - if (ret < 0) + ret = -ENOMEM; + ops = rbd_create_rw_ops(1, opcode, payload_len); + if (!ops) goto done; /* we've taken care of segment sizes earlier when we @@ -1112,7 +1113,6 @@ static int rbd_do_op(struct request *rq, NULL, 0, flags, ops, - num_reply, coll, coll_index, rbd_req_cb, 0, NULL); @@ -1136,7 +1136,6 @@ static int rbd_req_write(struct request *rq, return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 2, ofs, len, bio, coll, coll_index); } @@ -1155,55 +1154,58 @@ static int rbd_req_read(struct request *rq, snapid, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - 2, ofs, len, bio, coll, coll_index); } /* * Request sync osd read */ -static int rbd_req_sync_read(struct rbd_device *dev, - struct ceph_snap_context *snapc, +static int rbd_req_sync_read(struct rbd_device *rbd_dev, u64 snapid, - const char *obj, + const char *object_name, u64 ofs, u64 len, char *buf, u64 *ver) { - return rbd_req_sync_op(dev, NULL, + struct ceph_osd_req_op *ops; + int ret; + + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); + if (!ops) + return -ENOMEM; + + ret = rbd_req_sync_op(rbd_dev, NULL, snapid, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, - 1, obj, ofs, len, buf, NULL, ver); + ops, object_name, ofs, len, buf, NULL, ver); + rbd_destroy_ops(ops); + + return ret; } /* * Request sync osd watch */ -static int rbd_req_sync_notify_ack(struct rbd_device *dev, +static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, u64 ver, - u64 notify_id, - const char *obj) + u64 notify_id) { struct ceph_osd_req_op *ops; - struct page **pages = NULL; int ret; - ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); + if (!ops) + return -ENOMEM; - ops[0].watch.ver = cpu_to_le64(dev->header.obj_version); + ops[0].watch.ver = cpu_to_le64(ver); ops[0].watch.cookie = notify_id; ops[0].watch.flag = 0; - ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP, - obj, 0, 0, NULL, - pages, 0, + ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, + rbd_dev->header_name, 0, 0, NULL, + NULL, 0, CEPH_OSD_FLAG_READ, ops, - 1, NULL, 0, rbd_simple_req_cb, 0, NULL); @@ -1213,54 +1215,53 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { - struct rbd_device *dev = (struct rbd_device *)data; + struct rbd_device *rbd_dev = (struct rbd_device *)data; + u64 hver; int rc; - if (!dev) + if (!rbd_dev) return; - dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, - notify_id, (int)opcode); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rc = __rbd_refresh_header(dev); - mutex_unlock(&ctl_mutex); + dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", + rbd_dev->header_name, (unsigned long long) notify_id, + (unsigned int) opcode); + rc = rbd_refresh_header(rbd_dev, &hver); if (rc) pr_warning(RBD_DRV_NAME "%d got notification but failed to " - " update snaps: %d\n", dev->major, rc); + " update snaps: %d\n", rbd_dev->major, rc); - rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); + rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); } /* * Request sync osd watch */ -static int rbd_req_sync_watch(struct rbd_device *dev, - const char *obj, - u64 ver) +static int rbd_req_sync_watch(struct rbd_device *rbd_dev) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + int ret; - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); + if (!ops) + return -ENOMEM; ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, - (void *)dev, &dev->watch_event); + (void *)rbd_dev, &rbd_dev->watch_event); if (ret < 0) goto fail; - ops[0].watch.ver = cpu_to_le64(ver); - ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); + ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); + ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 1; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, - &dev->watch_request, NULL); + rbd_dev->header_name, + 0, 0, NULL, + &rbd_dev->watch_request, NULL); if (ret < 0) goto fail_event; @@ -1269,8 +1270,8 @@ static int rbd_req_sync_watch(struct rbd_device *dev, return 0; fail_event: - ceph_osdc_cancel_event(dev->watch_event); - dev->watch_event = NULL; + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; fail: rbd_destroy_ops(ops); return ret; @@ -1279,64 +1280,65 @@ fail: /* * Request sync osd unwatch */ -static int rbd_req_sync_unwatch(struct rbd_device *dev, - const char *obj) +static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) { struct ceph_osd_req_op *ops; + int ret; - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); + if (!ops) + return -ENOMEM; ops[0].watch.ver = 0; - ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); + ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 0; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, NULL, NULL); + rbd_dev->header_name, + 0, 0, NULL, NULL, NULL); + rbd_destroy_ops(ops); - ceph_osdc_cancel_event(dev->watch_event); - dev->watch_event = NULL; + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; return ret; } struct rbd_notify_info { - struct rbd_device *dev; + struct rbd_device *rbd_dev; }; static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { - struct rbd_device *dev = (struct rbd_device *)data; - if (!dev) + struct rbd_device *rbd_dev = (struct rbd_device *)data; + if (!rbd_dev) return; - dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, - notify_id, (int)opcode); + dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n", + rbd_dev->header_name, (unsigned long long) notify_id, + (unsigned int) opcode); } /* * Request sync osd notify */ -static int rbd_req_sync_notify(struct rbd_device *dev, - const char *obj) +static int rbd_req_sync_notify(struct rbd_device *rbd_dev) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_event *event; struct rbd_notify_info info; int payload_len = sizeof(u32) + sizeof(u32); int ret; - ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len); + if (!ops) + return -ENOMEM; - info.dev = dev; + info.rbd_dev = rbd_dev; ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, (void *)&info, &event); @@ -1349,12 +1351,12 @@ static int rbd_req_sync_notify(struct rbd_device *dev, ops[0].watch.prot_ver = RADOS_NOTIFY_VER; ops[0].watch.timeout = 12; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, NULL, NULL); + rbd_dev->header_name, + 0, 0, NULL, NULL, NULL); if (ret < 0) goto fail_event; @@ -1373,36 +1375,37 @@ fail: /* * Request sync osd read */ -static int rbd_req_sync_exec(struct rbd_device *dev, - const char *obj, - const char *cls, - const char *method, +static int rbd_req_sync_exec(struct rbd_device *rbd_dev, + const char *object_name, + const char *class_name, + const char *method_name, const char *data, int len, u64 *ver) { struct ceph_osd_req_op *ops; - int cls_len = strlen(cls); - int method_len = strlen(method); - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL, - cls_len + method_len + len); - if (ret < 0) - return ret; + int class_name_len = strlen(class_name); + int method_name_len = strlen(method_name); + int ret; - ops[0].cls.class_name = cls; - ops[0].cls.class_len = (__u8)cls_len; - ops[0].cls.method_name = method; - ops[0].cls.method_len = (__u8)method_len; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, + class_name_len + method_name_len + len); + if (!ops) + return -ENOMEM; + + ops[0].cls.class_name = class_name; + ops[0].cls.class_len = (__u8) class_name_len; + ops[0].cls.method_name = method_name; + ops[0].cls.method_len = (__u8) method_name_len; ops[0].cls.argc = 0; ops[0].cls.indata = data; ops[0].cls.indata_len = len; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, NULL, ver); + object_name, 0, 0, NULL, NULL, ver); rbd_destroy_ops(ops); @@ -1437,10 +1440,12 @@ static void rbd_rq_fn(struct request_queue *q) struct bio *bio; struct bio *rq_bio, *next_bio = NULL; bool do_write; - int size, op_size = 0; + unsigned int size; + u64 op_size = 0; u64 ofs; int num_segs, cur_seg = 0; struct rbd_req_coll *coll; + struct ceph_snap_context *snapc; /* peek at request from block layer */ if (!rq) @@ -1467,23 +1472,38 @@ static void rbd_rq_fn(struct request_queue *q) spin_unlock_irq(q->queue_lock); + down_read(&rbd_dev->header_rwsem); + + if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) { + up_read(&rbd_dev->header_rwsem); + dout("request for non-existent snapshot"); + spin_lock_irq(q->queue_lock); + __blk_end_request_all(rq, -ENXIO); + continue; + } + + snapc = ceph_get_snap_context(rbd_dev->header.snapc); + + up_read(&rbd_dev->header_rwsem); + dout("%s 0x%x bytes at 0x%llx\n", do_write ? "write" : "read", - size, blk_rq_pos(rq) * SECTOR_SIZE); + size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); coll = rbd_alloc_coll(num_segs); if (!coll) { spin_lock_irq(q->queue_lock); __blk_end_request_all(rq, -ENOMEM); + ceph_put_snap_context(snapc); continue; } do { /* a bio clone to be passed down to OSD req */ - dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt); + dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); op_size = rbd_get_segment(&rbd_dev->header, - rbd_dev->header.block_name, + rbd_dev->header.object_prefix, ofs, size, NULL, NULL); kref_get(&coll->kref); @@ -1499,7 +1519,7 @@ static void rbd_rq_fn(struct request_queue *q) /* init OSD command: write or read */ if (do_write) rbd_req_write(rq, rbd_dev, - rbd_dev->header.snapc, + snapc, ofs, op_size, bio, coll, cur_seg); @@ -1522,6 +1542,8 @@ next_seg: if (bp) bio_pair_release(bp); spin_lock_irq(q->queue_lock); + + ceph_put_snap_context(snapc); } } @@ -1592,18 +1614,19 @@ static int rbd_read_header(struct rbd_device *rbd_dev, return -ENOMEM; rc = rbd_req_sync_read(rbd_dev, - NULL, CEPH_NOSNAP, - rbd_dev->obj_md_name, + CEPH_NOSNAP, + rbd_dev->header_name, 0, len, (char *)dh, &ver); if (rc < 0) goto out_dh; - rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL); + rc = rbd_header_from_disk(header, dh, snap_count); if (rc < 0) { if (rc == -ENXIO) pr_warning("unrecognized header format" - " for image %s", rbd_dev->obj); + " for image %s\n", + rbd_dev->image_name); goto out_dh; } @@ -1628,7 +1651,7 @@ out_dh: /* * create a snapshot */ -static int rbd_header_add_snap(struct rbd_device *dev, +static int rbd_header_add_snap(struct rbd_device *rbd_dev, const char *snap_name, gfp_t gfp_flags) { @@ -1636,16 +1659,15 @@ static int rbd_header_add_snap(struct rbd_device *dev, u64 new_snapid; int ret; void *data, *p, *e; - u64 ver; struct ceph_mon_client *monc; /* we should create a snapshot only if we're pointing at the head */ - if (dev->snap_id != CEPH_NOSNAP) + if (rbd_dev->snap_id != CEPH_NOSNAP) return -EINVAL; - monc = &dev->rbd_client->client->monc; - ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid); - dout("created snapid=%lld\n", new_snapid); + monc = &rbd_dev->rbd_client->client->monc; + ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid); + dout("created snapid=%llu\n", (unsigned long long) new_snapid); if (ret < 0) return ret; @@ -1659,19 +1681,13 @@ static int rbd_header_add_snap(struct rbd_device *dev, ceph_encode_string_safe(&p, e, snap_name, name_len, bad); ceph_encode_64_safe(&p, e, new_snapid, bad); - ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", - data, p - data, &ver); + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + "rbd", "snap_add", + data, p - data, NULL); kfree(data); - if (ret < 0) - return ret; - - down_write(&dev->header_rwsem); - dev->header.snapc->seq = new_snapid; - up_write(&dev->header_rwsem); - - return 0; + return ret < 0 ? ret : 0; bad: return -ERANGE; } @@ -1679,52 +1695,52 @@ bad: static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) { struct rbd_snap *snap; + struct rbd_snap *next; - while (!list_empty(&rbd_dev->snaps)) { - snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node); - __rbd_remove_snap_dev(rbd_dev, snap); - } + list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) + __rbd_remove_snap_dev(snap); } /* * only read the first part of the ondisk header, without the snaps info */ -static int __rbd_refresh_header(struct rbd_device *rbd_dev) +static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) { int ret; struct rbd_image_header h; - u64 snap_seq; - int follow_seq = 0; ret = rbd_read_header(rbd_dev, &h); if (ret < 0) return ret; - /* resized? */ - set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE); - down_write(&rbd_dev->header_rwsem); - snap_seq = rbd_dev->header.snapc->seq; - if (rbd_dev->header.total_snaps && - rbd_dev->header.snapc->snaps[0] == snap_seq) - /* pointing at the head, will need to follow that - if head moves */ - follow_seq = 1; + /* resized? */ + if (rbd_dev->snap_id == CEPH_NOSNAP) { + sector_t size = (sector_t) h.image_size / SECTOR_SIZE; - kfree(rbd_dev->header.snapc); - kfree(rbd_dev->header.snap_names); + dout("setting size to %llu sectors", (unsigned long long) size); + set_capacity(rbd_dev->disk, size); + } + + /* rbd_dev->header.object_prefix shouldn't change */ kfree(rbd_dev->header.snap_sizes); + kfree(rbd_dev->header.snap_names); + /* osd requests may still refer to snapc */ + ceph_put_snap_context(rbd_dev->header.snapc); + if (hver) + *hver = h.obj_version; + rbd_dev->header.obj_version = h.obj_version; + rbd_dev->header.image_size = h.image_size; rbd_dev->header.total_snaps = h.total_snaps; rbd_dev->header.snapc = h.snapc; rbd_dev->header.snap_names = h.snap_names; rbd_dev->header.snap_names_len = h.snap_names_len; rbd_dev->header.snap_sizes = h.snap_sizes; - if (follow_seq) - rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0]; - else - rbd_dev->header.snapc->seq = snap_seq; + /* Free the extra copy of the object prefix */ + WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); + kfree(h.object_prefix); ret = __rbd_init_snaps_header(rbd_dev); @@ -1733,6 +1749,17 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev) return ret; } +static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) +{ + int ret; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + ret = __rbd_refresh_header(rbd_dev, hver); + mutex_unlock(&ctl_mutex); + + return ret; +} + static int rbd_init_disk(struct rbd_device *rbd_dev) { struct gendisk *disk; @@ -1762,7 +1789,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) goto out; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", - rbd_dev->id); + rbd_dev->dev_id); disk->major = rbd_dev->major; disk->first_minor = 0; disk->fops = &rbd_bd_ops; @@ -1819,8 +1846,13 @@ static ssize_t rbd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + sector_t size; + + down_read(&rbd_dev->header_rwsem); + size = get_capacity(rbd_dev->disk); + up_read(&rbd_dev->header_rwsem); - return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size); + return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); } static ssize_t rbd_major_show(struct device *dev, @@ -1848,12 +1880,20 @@ static ssize_t rbd_pool_show(struct device *dev, return sprintf(buf, "%s\n", rbd_dev->pool_name); } +static ssize_t rbd_pool_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%d\n", rbd_dev->pool_id); +} + static ssize_t rbd_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->obj); + return sprintf(buf, "%s\n", rbd_dev->image_name); } static ssize_t rbd_snap_show(struct device *dev, @@ -1871,23 +1911,18 @@ static ssize_t rbd_image_refresh(struct device *dev, size_t size) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - int rc; - int ret = size; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + int ret; - rc = __rbd_refresh_header(rbd_dev); - if (rc < 0) - ret = rc; + ret = rbd_refresh_header(rbd_dev, NULL); - mutex_unlock(&ctl_mutex); - return ret; + return ret < 0 ? ret : size; } static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); +static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); @@ -1898,6 +1933,7 @@ static struct attribute *rbd_attrs[] = { &dev_attr_major.attr, &dev_attr_client_id.attr, &dev_attr_pool.attr, + &dev_attr_pool_id.attr, &dev_attr_name.attr, &dev_attr_current_snap.attr, &dev_attr_refresh.attr, @@ -1977,15 +2013,13 @@ static struct device_type rbd_snap_device_type = { .release = rbd_snap_dev_release, }; -static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, - struct rbd_snap *snap) +static void __rbd_remove_snap_dev(struct rbd_snap *snap) { list_del(&snap->node); device_unregister(&snap->dev); } -static int rbd_register_snap_dev(struct rbd_device *rbd_dev, - struct rbd_snap *snap, +static int rbd_register_snap_dev(struct rbd_snap *snap, struct device *parent) { struct device *dev = &snap->dev; @@ -2000,29 +2034,36 @@ static int rbd_register_snap_dev(struct rbd_device *rbd_dev, return ret; } -static int __rbd_add_snap_dev(struct rbd_device *rbd_dev, - int i, const char *name, - struct rbd_snap **snapp) +static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, + int i, const char *name) { + struct rbd_snap *snap; int ret; - struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL); + + snap = kzalloc(sizeof (*snap), GFP_KERNEL); if (!snap) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; snap->name = kstrdup(name, GFP_KERNEL); + if (!snap->name) + goto err; + snap->size = rbd_dev->header.snap_sizes[i]; snap->id = rbd_dev->header.snapc->snaps[i]; if (device_is_registered(&rbd_dev->dev)) { - ret = rbd_register_snap_dev(rbd_dev, snap, - &rbd_dev->dev); + ret = rbd_register_snap_dev(snap, &rbd_dev->dev); if (ret < 0) goto err; } - *snapp = snap; - return 0; + + return snap; + err: kfree(snap->name); kfree(snap); - return ret; + + return ERR_PTR(ret); } /* @@ -2055,7 +2096,6 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) const char *name, *first_name; int i = rbd_dev->header.total_snaps; struct rbd_snap *snap, *old_snap = NULL; - int ret; struct list_head *p, *n; first_name = rbd_dev->header.snap_names; @@ -2070,8 +2110,15 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) cur_id = rbd_dev->header.snapc->snaps[i - 1]; if (!i || old_snap->id < cur_id) { - /* old_snap->id was skipped, thus was removed */ - __rbd_remove_snap_dev(rbd_dev, old_snap); + /* + * old_snap->id was skipped, thus was + * removed. If this rbd_dev is mapped to + * the removed snapshot, record that it no + * longer exists, to prevent further I/O. + */ + if (rbd_dev->snap_id == old_snap->id) + rbd_dev->snap_exists = false; + __rbd_remove_snap_dev(old_snap); continue; } if (old_snap->id == cur_id) { @@ -2091,9 +2138,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) if (cur_id >= old_snap->id) break; /* a new snapshot */ - ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap); - if (ret < 0) - return ret; + snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); + if (IS_ERR(snap)) + return PTR_ERR(snap); /* note that we add it backward so using n and not p */ list_add(&snap->node, n); @@ -2107,9 +2154,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) WARN_ON(1); return -EINVAL; } - ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap); - if (ret < 0) - return ret; + snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); + if (IS_ERR(snap)) + return PTR_ERR(snap); list_add(&snap->node, &rbd_dev->snaps); } @@ -2129,14 +2176,13 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) dev->type = &rbd_device_type; dev->parent = &rbd_root_dev; dev->release = rbd_dev_release; - dev_set_name(dev, "%d", rbd_dev->id); + dev_set_name(dev, "%d", rbd_dev->dev_id); ret = device_register(dev); if (ret < 0) goto out; list_for_each_entry(snap, &rbd_dev->snaps, node) { - ret = rbd_register_snap_dev(rbd_dev, snap, - &rbd_dev->dev); + ret = rbd_register_snap_dev(snap, &rbd_dev->dev); if (ret < 0) break; } @@ -2155,12 +2201,9 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) int ret, rc; do { - ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, - rbd_dev->header.obj_version); + ret = rbd_req_sync_watch(rbd_dev); if (ret == -ERANGE) { - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rc = __rbd_refresh_header(rbd_dev); - mutex_unlock(&ctl_mutex); + rc = rbd_refresh_header(rbd_dev, NULL); if (rc < 0) return rc; } @@ -2177,7 +2220,7 @@ static atomic64_t rbd_id_max = ATOMIC64_INIT(0); */ static void rbd_id_get(struct rbd_device *rbd_dev) { - rbd_dev->id = atomic64_inc_return(&rbd_id_max); + rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max); spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); @@ -2191,7 +2234,7 @@ static void rbd_id_get(struct rbd_device *rbd_dev) static void rbd_id_put(struct rbd_device *rbd_dev) { struct list_head *tmp; - int rbd_id = rbd_dev->id; + int rbd_id = rbd_dev->dev_id; int max_id; BUG_ON(rbd_id < 1); @@ -2282,19 +2325,58 @@ static inline size_t copy_token(const char **buf, } /* - * This fills in the pool_name, obj, obj_len, snap_name, obj_len, + * Finds the next token in *buf, dynamically allocates a buffer big + * enough to hold a copy of it, and copies the token into the new + * buffer. The copy is guaranteed to be terminated with '\0'. Note + * that a duplicate buffer is created even for a zero-length token. + * + * Returns a pointer to the newly-allocated duplicate, or a null + * pointer if memory for the duplicate was not available. If + * the lenp argument is a non-null pointer, the length of the token + * (not including the '\0') is returned in *lenp. + * + * If successful, the *buf pointer will be updated to point beyond + * the end of the found token. + * + * Note: uses GFP_KERNEL for allocation. + */ +static inline char *dup_token(const char **buf, size_t *lenp) +{ + char *dup; + size_t len; + + len = next_token(buf); + dup = kmalloc(len + 1, GFP_KERNEL); + if (!dup) + return NULL; + + memcpy(dup, *buf, len); + *(dup + len) = '\0'; + *buf += len; + + if (lenp) + *lenp = len; + + return dup; +} + +/* + * This fills in the pool_name, image_name, image_name_len, snap_name, * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based * on the list of monitor addresses and other options provided via * /sys/bus/rbd/add. + * + * Note: rbd_dev is assumed to have been initially zero-filled. */ static int rbd_add_parse_args(struct rbd_device *rbd_dev, const char *buf, const char **mon_addrs, size_t *mon_addrs_size, char *options, - size_t options_size) + size_t options_size) { - size_t len; + size_t len; + int ret; /* The first four tokens are required */ @@ -2310,56 +2392,74 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, if (!len || len >= options_size) return -EINVAL; - len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name)); - if (!len || len >= sizeof (rbd_dev->pool_name)) - return -EINVAL; - - len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj)); - if (!len || len >= sizeof (rbd_dev->obj)) - return -EINVAL; + ret = -ENOMEM; + rbd_dev->pool_name = dup_token(&buf, NULL); + if (!rbd_dev->pool_name) + goto out_err; - /* We have the object length in hand, save it. */ + rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len); + if (!rbd_dev->image_name) + goto out_err; - rbd_dev->obj_len = len; + /* Create the name of the header object */ - BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN - < RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX)); - sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX); + rbd_dev->header_name = kmalloc(rbd_dev->image_name_len + + sizeof (RBD_SUFFIX), + GFP_KERNEL); + if (!rbd_dev->header_name) + goto out_err; + sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); /* - * The snapshot name is optional, but it's an error if it's - * too long. If no snapshot is supplied, fill in the default. + * The snapshot name is optional. If none is is supplied, + * we use the default value. */ - len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name)); - if (!len) + rbd_dev->snap_name = dup_token(&buf, &len); + if (!rbd_dev->snap_name) + goto out_err; + if (!len) { + /* Replace the empty name with the default */ + kfree(rbd_dev->snap_name); + rbd_dev->snap_name + = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL); + if (!rbd_dev->snap_name) + goto out_err; + memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME)); - else if (len >= sizeof (rbd_dev->snap_name)) - return -EINVAL; + } return 0; + +out_err: + kfree(rbd_dev->header_name); + kfree(rbd_dev->image_name); + kfree(rbd_dev->pool_name); + rbd_dev->pool_name = NULL; + + return ret; } static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) { - struct rbd_device *rbd_dev; + char *options; + struct rbd_device *rbd_dev = NULL; const char *mon_addrs = NULL; size_t mon_addrs_size = 0; - char *options = NULL; struct ceph_osd_client *osdc; int rc = -ENOMEM; if (!try_module_get(THIS_MODULE)) return -ENODEV; - rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); - if (!rbd_dev) - goto err_nomem; options = kmalloc(count, GFP_KERNEL); if (!options) goto err_nomem; + rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); + if (!rbd_dev) + goto err_nomem; /* static rbd_device initialization */ spin_lock_init(&rbd_dev->lock); @@ -2367,15 +2467,13 @@ static ssize_t rbd_add(struct bus_type *bus, INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); - init_rwsem(&rbd_dev->header_rwsem); - /* generate unique id: find highest unique id, add one */ rbd_id_get(rbd_dev); /* Fill in the device name, now that we have its id. */ BUILD_BUG_ON(DEV_NAME_LEN < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); - sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id); + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); /* parse add command */ rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, @@ -2395,7 +2493,7 @@ static ssize_t rbd_add(struct bus_type *bus, rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); if (rc < 0) goto err_out_client; - rbd_dev->poolid = rc; + rbd_dev->pool_id = rc; /* register our block device */ rc = register_blkdev(0, rbd_dev->name); @@ -2435,10 +2533,16 @@ err_out_blkdev: err_out_client: rbd_put_client(rbd_dev); err_put_id: + if (rbd_dev->pool_name) { + kfree(rbd_dev->snap_name); + kfree(rbd_dev->header_name); + kfree(rbd_dev->image_name); + kfree(rbd_dev->pool_name); + } rbd_id_put(rbd_dev); err_nomem: - kfree(options); kfree(rbd_dev); + kfree(options); dout("Error adding device %s\n", buf); module_put(THIS_MODULE); @@ -2446,7 +2550,7 @@ err_nomem: return (ssize_t) rc; } -static struct rbd_device *__rbd_get_dev(unsigned long id) +static struct rbd_device *__rbd_get_dev(unsigned long dev_id) { struct list_head *tmp; struct rbd_device *rbd_dev; @@ -2454,7 +2558,7 @@ static struct rbd_device *__rbd_get_dev(unsigned long id) spin_lock(&rbd_dev_list_lock); list_for_each(tmp, &rbd_dev_list) { rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->id == id) { + if (rbd_dev->dev_id == dev_id) { spin_unlock(&rbd_dev_list_lock); return rbd_dev; } @@ -2474,7 +2578,7 @@ static void rbd_dev_release(struct device *dev) rbd_dev->watch_request); } if (rbd_dev->watch_event) - rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name); + rbd_req_sync_unwatch(rbd_dev); rbd_put_client(rbd_dev); @@ -2483,6 +2587,10 @@ static void rbd_dev_release(struct device *dev) unregister_blkdev(rbd_dev->major, rbd_dev->name); /* done with the id, and with the rbd_dev */ + kfree(rbd_dev->snap_name); + kfree(rbd_dev->header_name); + kfree(rbd_dev->pool_name); + kfree(rbd_dev->image_name); rbd_id_put(rbd_dev); kfree(rbd_dev); @@ -2544,7 +2652,7 @@ static ssize_t rbd_snap_add(struct device *dev, if (ret < 0) goto err_unlock; - ret = __rbd_refresh_header(rbd_dev); + ret = __rbd_refresh_header(rbd_dev, NULL); if (ret < 0) goto err_unlock; @@ -2553,7 +2661,7 @@ static ssize_t rbd_snap_add(struct device *dev, mutex_unlock(&ctl_mutex); /* make a best effort, don't error if failed */ - rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); + rbd_req_sync_notify(rbd_dev); ret = count; kfree(name); diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index 950708688f17..0924e9e41a60 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -31,7 +31,6 @@ #define RBD_MIN_OBJ_ORDER 16 #define RBD_MAX_OBJ_ORDER 30 -#define RBD_MAX_OBJ_NAME_LEN 96 #define RBD_MAX_SEG_NAME_LEN 128 #define RBD_COMP_NONE 0 diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 00894ff9246c..f391f1e75414 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -51,8 +51,7 @@ int ceph_init_dentry(struct dentry *dentry) goto out_unlock; } - if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ - ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) d_set_d_op(dentry, &ceph_dentry_ops); else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) d_set_d_op(dentry, &ceph_snapdir_dentry_ops); @@ -79,7 +78,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) return NULL; spin_lock(&dentry->d_lock); - if (dentry->d_parent) { + if (!IS_ROOT(dentry)) { inode = dentry->d_parent->d_inode; ihold(inode); } @@ -1154,7 +1153,7 @@ static void ceph_d_prune(struct dentry *dentry) dout("ceph_d_prune %p\n", dentry); /* do we have a valid parent? */ - if (!dentry->d_parent || IS_ROOT(dentry)) + if (IS_ROOT(dentry)) return; /* if we are not hashed, we don't affect D_COMPLETE */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 200bc87eceb1..a5a735422aa7 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -10,6 +10,7 @@ #include "super.h" #include "mds_client.h" +#include <linux/ceph/ceph_features.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/pagelist.h> @@ -394,11 +395,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_seq = 0; mutex_init(&s->s_mutex); - ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); - s->s_con.private = s; - s->s_con.ops = &mds_con_ops; - s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; - s->s_con.peer_name.num = cpu_to_le64(mds); + ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; @@ -440,7 +437,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, mdsc->sessions[mds] = s; atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ - ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); return s; @@ -1472,11 +1470,6 @@ retry: else len += 1 + temp->d_name.len; temp = temp->d_parent; - if (temp == NULL) { - rcu_read_unlock(); - pr_err("build_path corrupt dentry %p\n", dentry); - return ERR_PTR(-EINVAL); - } } rcu_read_unlock(); if (len) @@ -1513,12 +1506,6 @@ retry: if (pos) path[--pos] = '/'; temp = temp->d_parent; - if (temp == NULL) { - rcu_read_unlock(); - pr_err("build_path corrupt dentry\n"); - kfree(path); - return ERR_PTR(-EINVAL); - } } rcu_read_unlock(); if (pos != 0 || read_seqretry(&rename_lock, seq)) { @@ -2531,7 +2518,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; + ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, + CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); /* replay unsafe requests */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e5206fc76562..cbb2f54a3019 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -296,8 +296,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; int err = 0; - int i; - int num = realm->num_prior_parent_snaps + realm->num_snaps; + u32 num = realm->num_prior_parent_snaps + realm->num_snaps; /* * build parent context, if it hasn't been built. @@ -321,11 +320,11 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" " (unchanged)\n", realm->ino, realm, realm->cached_context, realm->cached_context->seq, - realm->cached_context->num_snaps); + (unsigned int) realm->cached_context->num_snaps); return 0; } @@ -342,6 +341,8 @@ static int build_snap_context(struct ceph_snap_realm *realm) num = 0; snapc->seq = realm->seq; if (parent) { + u32 i; + /* include any of parent's snaps occurring _after_ my parent became my parent */ for (i = 0; i < parent->cached_context->num_snaps; i++) @@ -361,8 +362,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n", - realm->ino, realm, snapc, snapc->seq, snapc->num_snaps); + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", + realm->ino, realm, snapc, snapc->seq, + (unsigned int) snapc->num_snaps); if (realm->cached_context) ceph_put_snap_context(realm->cached_context); @@ -402,9 +404,9 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm) * helper to allocate and decode an array of snapids. free prior * instance, if any. */ -static int dup_array(u64 **dst, __le64 *src, int num) +static int dup_array(u64 **dst, __le64 *src, u32 num) { - int i; + u32 i; kfree(*dst); if (num) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7076109f014d..b982239f38f9 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -18,6 +18,7 @@ #include "super.h" #include "mds_client.h" +#include <linux/ceph/ceph_features.h> #include <linux/ceph/decode.h> #include <linux/ceph/mon_client.h> #include <linux/ceph/auth.h> diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f4d5522cb619..ebc95cc652be 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -612,9 +612,9 @@ struct ceph_snap_realm { u64 parent_since; /* snapid when our current parent became so */ u64 *prior_parent_snaps; /* snaps inherited from any parents we */ - int num_prior_parent_snaps; /* had prior to parent_since */ + u32 num_prior_parent_snaps; /* had prior to parent_since */ u64 *snaps; /* snaps specific to this realm */ - int num_snaps; + u32 num_snaps; struct ceph_snap_realm *parent; struct list_head children; /* list of child realms */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 785cb3057c95..2c2ae5be9902 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -457,6 +457,7 @@ start: for (i = 0; i < numattr; i++) kfree(xattrs[i]); kfree(xattrs); + xattrs = NULL; goto start; } err = -EIO; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8f660dd6137a..50d0b78130a1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -628,8 +628,8 @@ static long writeback_sb_inodes(struct super_block *sb, } /* - * Don't bother with new inodes or inodes beeing freed, first - * kind does not need peridic writeout yet, and for the latter + * Don't bother with new inodes or inodes being freed, first + * kind does not need periodic writeout yet, and for the latter * kind writeout is handled by the freer. */ spin_lock(&inode->i_lock); diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c index 183cc1f0af1c..6d1ee7204c88 100644 --- a/fs/lockd/grace.c +++ b/fs/lockd/grace.c @@ -4,8 +4,10 @@ #include <linux/module.h> #include <linux/lockd/bind.h> +#include <net/net_namespace.h> + +#include "netns.h" -static LIST_HEAD(grace_list); static DEFINE_SPINLOCK(grace_lock); /** @@ -19,10 +21,12 @@ static DEFINE_SPINLOCK(grace_lock); * * This function is called to start a grace period. */ -void locks_start_grace(struct lock_manager *lm) +void locks_start_grace(struct net *net, struct lock_manager *lm) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + spin_lock(&grace_lock); - list_add(&lm->list, &grace_list); + list_add(&lm->list, &ln->grace_list); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); @@ -52,8 +56,10 @@ EXPORT_SYMBOL_GPL(locks_end_grace); * to answer ordinary lock requests, and when they should accept only * lock reclaims. */ -int locks_in_grace(void) +int locks_in_grace(struct net *net) { - return !list_empty(&grace_list); + struct lockd_net *ln = net_generic(net, lockd_net_id); + + return !list_empty(&ln->grace_list); } EXPORT_SYMBOL_GPL(locks_in_grace); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index eb75ca7c2d6e..f9b22e58f78f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -21,6 +21,8 @@ #include <net/ipv6.h> +#include "netns.h" + #define NLMDBG_FACILITY NLMDBG_HOSTCACHE #define NLM_HOST_NRHASH 32 #define NLM_HOST_REBIND (60 * HZ) @@ -41,11 +43,10 @@ static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; hlist_for_each_entry_safe((host), (pos), (next), \ (chain), h_hash) -static unsigned long next_gc; static unsigned long nrhosts; static DEFINE_MUTEX(nlm_host_mutex); -static void nlm_gc_hosts(void); +static void nlm_gc_hosts(struct net *net); struct nlm_lookup_host_info { const int server; /* search for server|client */ @@ -172,6 +173,7 @@ out: static void nlm_destroy_host_locked(struct nlm_host *host) { struct rpc_clnt *clnt; + struct lockd_net *ln = net_generic(host->net, lockd_net_id); dprintk("lockd: destroy host %s\n", host->h_name); @@ -188,6 +190,7 @@ static void nlm_destroy_host_locked(struct nlm_host *host) rpc_shutdown_client(clnt); kfree(host); + ln->nrhosts--; nrhosts--; } @@ -228,6 +231,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; + struct lockd_net *ln = net_generic(net, lockd_net_id); dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, (hostname ? hostname : "<none>"), version, @@ -262,6 +266,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, goto out; hlist_add_head(&host->h_hash, chain); + ln->nrhosts++; nrhosts++; dprintk("lockd: %s created host %s (%s)\n", __func__, @@ -326,7 +331,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, struct nsm_handle *nsm = NULL; struct sockaddr *src_sap = svc_daddr(rqstp); size_t src_len = rqstp->rq_daddrlen; - struct net *net = rqstp->rq_xprt->xpt_net; + struct net *net = SVC_NET(rqstp); struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -337,6 +342,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, .hostname_len = hostname_len, .net = net, }; + struct lockd_net *ln = net_generic(net, lockd_net_id); dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, (int)hostname_len, hostname, rqstp->rq_vers, @@ -344,8 +350,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, mutex_lock(&nlm_host_mutex); - if (time_after_eq(jiffies, next_gc)) - nlm_gc_hosts(); + if (time_after_eq(jiffies, ln->next_gc)) + nlm_gc_hosts(net); chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { @@ -382,6 +388,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, memcpy(nlm_srcaddr(host), src_sap, src_len); host->h_srcaddrlen = src_len; hlist_add_head(&host->h_hash, chain); + ln->nrhosts++; nrhosts++; dprintk("lockd: %s created host %s (%s)\n", @@ -565,6 +572,35 @@ void nlm_host_rebooted(const struct nlm_reboot *info) nsm_release(nsm); } +static void nlm_complain_hosts(struct net *net) +{ + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + + if (net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + if (ln->nrhosts == 0) + return; + printk(KERN_WARNING "lockd: couldn't shutdown host module for net %p!\n", net); + dprintk("lockd: %lu hosts left in net %p:\n", ln->nrhosts, net); + } else { + if (nrhosts == 0) + return; + printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); + dprintk("lockd: %lu hosts left:\n", nrhosts); + } + + for_each_host(host, pos, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; + dprintk(" %s (cnt %d use %d exp %ld net %p)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires, host->net); + } +} + void nlm_shutdown_hosts_net(struct net *net) { @@ -572,11 +608,10 @@ nlm_shutdown_hosts_net(struct net *net) struct hlist_node *pos; struct nlm_host *host; - dprintk("lockd: shutting down host module\n"); mutex_lock(&nlm_host_mutex); /* First, make all hosts eligible for gc */ - dprintk("lockd: nuking all hosts...\n"); + dprintk("lockd: nuking all hosts in net %p...\n", net); for_each_host(host, pos, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -588,8 +623,10 @@ nlm_shutdown_hosts_net(struct net *net) } /* Then, perform a garbage collection pass */ - nlm_gc_hosts(); + nlm_gc_hosts(net); mutex_unlock(&nlm_host_mutex); + + nlm_complain_hosts(net); } /* @@ -599,22 +636,8 @@ nlm_shutdown_hosts_net(struct net *net) void nlm_shutdown_hosts(void) { - struct hlist_head *chain; - struct hlist_node *pos; - struct nlm_host *host; - + dprintk("lockd: shutting down host module\n"); nlm_shutdown_hosts_net(NULL); - - /* complain if any hosts are left */ - if (nrhosts != 0) { - printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); - dprintk("lockd: %lu hosts left:\n", nrhosts); - for_each_host(host, pos, chain, nlm_server_hosts) { - dprintk(" %s (cnt %d use %d exp %ld net %p)\n", - host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); - } - } } /* @@ -623,30 +646,39 @@ nlm_shutdown_hosts(void) * mark & sweep for resources held by remote clients. */ static void -nlm_gc_hosts(void) +nlm_gc_hosts(struct net *net) { struct hlist_head *chain; struct hlist_node *pos, *next; struct nlm_host *host; - dprintk("lockd: host garbage collection\n"); - for_each_host(host, pos, chain, nlm_server_hosts) + dprintk("lockd: host garbage collection for net %p\n", net); + for_each_host(host, pos, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; host->h_inuse = 0; + } /* Mark all hosts that hold locks, blocks or shares */ - nlmsvc_mark_resources(); + nlmsvc_mark_resources(net); for_each_host_safe(host, pos, next, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; if (atomic_read(&host->h_count) || host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " - "(cnt %d use %d exp %ld)\n", + "(cnt %d use %d exp %ld net %p)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires); + host->h_inuse, host->h_expires, host->net); continue; } nlm_destroy_host_locked(host); } - next_gc = jiffies + NLM_HOST_COLLECT; + if (net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + ln->next_gc = jiffies + NLM_HOST_COLLECT; + } } diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index ce227e0fbc5c..4eee248ba96e 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -1,10 +1,17 @@ #ifndef __LOCKD_NETNS_H__ #define __LOCKD_NETNS_H__ +#include <linux/fs.h> #include <net/netns/generic.h> struct lockd_net { unsigned int nlmsvc_users; + unsigned long next_gc; + unsigned long nrhosts; + + struct delayed_work grace_period_end; + struct lock_manager lockd_manager; + struct list_head grace_list; }; extern int lockd_net_id; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 80938fda67e0..31a63f87b806 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -87,32 +87,36 @@ static unsigned long get_lockd_grace_period(void) return nlm_timeout * 5 * HZ; } -static struct lock_manager lockd_manager = { -}; - -static void grace_ender(struct work_struct *not_used) +static void grace_ender(struct work_struct *grace) { - locks_end_grace(&lockd_manager); -} + struct delayed_work *dwork = container_of(grace, struct delayed_work, + work); + struct lockd_net *ln = container_of(dwork, struct lockd_net, + grace_period_end); -static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); + locks_end_grace(&ln->lockd_manager); +} -static void set_grace_period(void) +static void set_grace_period(struct net *net) { unsigned long grace_period = get_lockd_grace_period(); + struct lockd_net *ln = net_generic(net, lockd_net_id); - locks_start_grace(&lockd_manager); - cancel_delayed_work_sync(&grace_period_end); - schedule_delayed_work(&grace_period_end, grace_period); + locks_start_grace(net, &ln->lockd_manager); + cancel_delayed_work_sync(&ln->grace_period_end); + schedule_delayed_work(&ln->grace_period_end, grace_period); } static void restart_grace(void) { if (nlmsvc_ops) { - cancel_delayed_work_sync(&grace_period_end); - locks_end_grace(&lockd_manager); + struct net *net = &init_net; + struct lockd_net *ln = net_generic(net, lockd_net_id); + + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); nlmsvc_invalidate_all(); - set_grace_period(); + set_grace_period(net); } } @@ -137,8 +141,6 @@ lockd(void *vrqstp) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - set_grace_period(); - /* * The main request loop. We don't terminate until the last * NFS mount or NFS daemon has gone away. @@ -184,8 +186,6 @@ lockd(void *vrqstp) svc_process(rqstp); } flush_signals(current); - cancel_delayed_work_sync(&grace_period_end); - locks_end_grace(&lockd_manager); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); @@ -266,6 +266,7 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) error = make_socks(serv, net); if (error < 0) goto err_socks; + set_grace_period(net); dprintk("lockd_up_net: per-net data created; net=%p\n", net); return 0; @@ -283,6 +284,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); svc_shutdown_net(serv, net); dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); } @@ -589,6 +592,10 @@ module_param(nlm_max_connections, uint, 0644); static int lockd_init_net(struct net *net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); + INIT_LIST_HEAD(&ln->grace_list); return 0; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 9a41fdc19511..4a43d253c045 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -11,6 +11,7 @@ #include <linux/time.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> +#include <linux/sunrpc/svc_xprt.h> #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -151,7 +152,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -161,7 +162,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Try to cancel request. */ - resp->status = nlmsvc_cancel_blocked(file, &argp->lock); + resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); nlmsvc_release_host(host); @@ -184,7 +185,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -194,7 +195,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Now try to remove the lock */ - resp->status = nlmsvc_unlock(file, &argp->lock); + resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); nlmsvc_release_host(host); @@ -321,7 +322,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (locks_in_grace() && !argp->reclaim) { + if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -354,7 +355,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e46353f41a42..afe4488c33d8 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -26,7 +26,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> #include <linux/kthread.h> @@ -447,11 +447,11 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - if (locks_in_grace() && !reclaim) { + if (locks_in_grace(SVC_NET(rqstp)) && !reclaim) { ret = nlm_lck_denied_grace_period; goto out; } - if (reclaim && !locks_in_grace()) { + if (reclaim && !locks_in_grace(SVC_NET(rqstp))) { ret = nlm_lck_denied_grace_period; goto out; } @@ -559,7 +559,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { ret = nlm_lck_denied_grace_period; goto out; } @@ -603,7 +603,7 @@ out: * must be removed. */ __be32 -nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) +nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { int error; @@ -615,7 +615,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_end); /* First, cancel any lock that might be there */ - nlmsvc_cancel_blocked(file, lock); + nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); @@ -631,7 +631,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) * The calling procedure must check whether the file can be closed. */ __be32 -nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) +nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { struct nlm_block *block; int status = 0; @@ -643,7 +643,7 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); - if (locks_in_grace()) + if (locks_in_grace(net)) return nlm_lck_denied_grace_period; mutex_lock(&file->f_mutex); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index d27aab11f324..de8f2caa2235 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -11,6 +11,7 @@ #include <linux/time.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> +#include <linux/sunrpc/svc_xprt.h> #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -175,13 +176,14 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; + struct net *net = SVC_NET(rqstp); dprintk("lockd: CANCEL called\n"); resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(net)) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -191,7 +193,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Try to cancel request. */ - resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); + resp->status = cast_status(nlmsvc_cancel_blocked(net, file, &argp->lock)); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); nlmsvc_release_host(host); @@ -208,13 +210,14 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; + struct net *net = SVC_NET(rqstp); dprintk("lockd: UNLOCK called\n"); resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(net)) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -224,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Now try to remove the lock */ - resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); + resp->status = cast_status(nlmsvc_unlock(net, file, &argp->lock)); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); nlmsvc_release_host(host); @@ -361,7 +364,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (locks_in_grace() && !argp->reclaim) { + if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -394,7 +397,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (locks_in_grace()) { + if (locks_in_grace(SVC_NET(rqstp))) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 2240d384d787..0deb5f6c9dd4 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -309,7 +309,8 @@ nlm_release_file(struct nlm_file *file) * Helpers function for resource traversal * * nlmsvc_mark_host: - * used by the garbage collector; simply sets h_inuse. + * used by the garbage collector; simply sets h_inuse only for those + * hosts, which passed network check. * Always returns 0. * * nlmsvc_same_host: @@ -320,12 +321,15 @@ nlm_release_file(struct nlm_file *file) * returns 1 iff the host is a client. * Used by nlmsvc_invalidate_all */ + static int -nlmsvc_mark_host(void *data, struct nlm_host *dummy) +nlmsvc_mark_host(void *data, struct nlm_host *hint) { struct nlm_host *host = data; - host->h_inuse = 1; + if ((hint->net == NULL) || + (host->net == hint->net)) + host->h_inuse = 1; return 0; } @@ -358,10 +362,13 @@ nlmsvc_is_client(void *data, struct nlm_host *dummy) * Mark all hosts that still hold resources */ void -nlmsvc_mark_resources(void) +nlmsvc_mark_resources(struct net *net) { - dprintk("lockd: nlmsvc_mark_resources\n"); - nlm_traverse_files(NULL, nlmsvc_mark_host, NULL); + struct nlm_host hint; + + dprintk("lockd: nlmsvc_mark_resources for net %p\n", net); + hint.net = net; + nlm_traverse_files(&hint, nlmsvc_mark_host, NULL); } /* diff --git a/fs/locks.c b/fs/locks.c index 82c353304f9e..cdcf219a7391 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -427,18 +427,8 @@ static void lease_break_callback(struct file_lock *fl) kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); } -static void lease_release_private_callback(struct file_lock *fl) -{ - if (!fl->fl_file) - return; - - f_delown(fl->fl_file); - fl->fl_file->f_owner.signum = 0; -} - static const struct lock_manager_operations lease_manager_ops = { .lm_break = lease_break_callback, - .lm_release_private = lease_release_private_callback, .lm_change = lease_modify, }; @@ -580,12 +570,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p) fl->fl_next = NULL; list_del_init(&fl->fl_link); - fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); - if (fl->fl_fasync != NULL) { - printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); - fl->fl_fasync = NULL; - } - if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; @@ -1155,8 +1139,18 @@ int lease_modify(struct file_lock **before, int arg) return error; lease_clear_pending(fl, arg); locks_wake_up_blocks(fl); - if (arg == F_UNLCK) + if (arg == F_UNLCK) { + struct file *filp = fl->fl_file; + + f_delown(filp); + filp->f_owner.signum = 0; + fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); + if (fl->fl_fasync != NULL) { + printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); + fl->fl_fasync = NULL; + } locks_delete_lock(before); + } return 0; } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index e64b01d2a338..742ff4ffced7 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -863,7 +863,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r .drc_status = 0, .clp = NULL, .slotid = NFS4_NO_SLOT, - .net = rqstp->rq_xprt->xpt_net, + .net = SVC_NET(rqstp), }; unsigned int nops = 0; @@ -879,7 +879,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r return rpc_garbage_args; if (hdr_arg.minorversion == 0) { - cps.clp = nfs4_find_client_ident(rqstp->rq_xprt->xpt_net, hdr_arg.cb_ident); + cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) return rpc_drop_reply; } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ba233499b9a5..a3946cf13fc8 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -398,7 +398,7 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) int migrated, i, err; /* listsize */ - err = get_int(mesg, &fsloc->locations_count); + err = get_uint(mesg, &fsloc->locations_count); if (err) return err; if (fsloc->locations_count > MAX_FS_LOCATIONS) @@ -456,7 +456,7 @@ static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) return -EINVAL; for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { - err = get_int(mesg, &f->pseudoflavor); + err = get_uint(mesg, &f->pseudoflavor); if (err) return err; /* @@ -465,7 +465,7 @@ static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) * problem at export time instead of when a client fails * to authenticate. */ - err = get_int(mesg, &f->flags); + err = get_uint(mesg, &f->flags); if (err) return err; /* Only some flags are allowed to differ between flavors: */ @@ -929,7 +929,7 @@ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); - struct nfsd_net *nn = net_generic(rqstp->rq_xprt->xpt_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) @@ -960,7 +960,7 @@ struct svc_export * rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); - struct nfsd_net *nn = net_generic(rqstp->rq_xprt->xpt_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 39365636b244..65c2431ea32f 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -34,6 +34,10 @@ struct nfsd_net { struct cache_detail *idtoname_cache; struct cache_detail *nametoid_cache; + + struct lock_manager nfsd4_manager; + bool grace_ended; + time_t boot_time; }; extern int nfsd_net_id; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index a5fd6b982f27..cbaf4f8bb7b7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -756,7 +756,6 @@ static void do_probe_callback(struct nfs4_client *clp) */ void nfsd4_probe_callback(struct nfs4_client *clp) { - /* XXX: atomicity? Also, should we be using cl_flags? */ clp->cl_cb_state = NFSD4_CB_UNKNOWN; set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); do_probe_callback(clp); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index dae36f1dee95..fdc91a6fc9c4 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -546,7 +546,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen .type = type, }; int ret; - struct nfsd_net *nn = net_generic(rqstp->rq_xprt->xpt_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (namelen + 1 > sizeof(key.name)) return nfserr_badowner; @@ -571,7 +571,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) .type = type, }; int ret; - struct nfsd_net *nn = net_generic(rqstp->rq_xprt->xpt_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 987e719fbae8..c9c1c0a25417 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -354,10 +354,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -686,7 +686,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check stateid */ - if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid, + if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), + cstate, &read->rd_stateid, RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; @@ -741,7 +742,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - if (locks_in_grace()) + if (locks_in_grace(SVC_NET(rqstp))) return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); @@ -760,8 +761,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!cstate->save_fh.fh_dentry) return status; - if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags - & NFSEXP_NOSUBTREECHECK)) + if (locks_in_grace(SVC_NET(rqstp)) && + !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, rename->rn_snamelen, &cstate->current_fh, @@ -845,7 +846,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(cstate, + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, &setattr->sa_stateid, WR_STATE, NULL); nfs4_unlock_state(); if (status) { @@ -890,7 +891,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp); + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), + cstate, stateid, WR_STATE, &filp); if (filp) get_file(filp); nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 94effd5bc4a1..cc894eda385a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -38,18 +38,21 @@ #include <linux/namei.h> #include <linux/swap.h> #include <linux/pagemap.h> +#include <linux/ratelimit.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/clnt.h> #include "xdr4.h" #include "vfs.h" #include "current_stateid.h" +#include "fault_inject.h" + +#include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; -static time_t boot_time; #define all_ones {{~0,~0},~0} static const stateid_t one_stateid = { @@ -862,6 +865,11 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, if (ret) /* oops; xprt is already down: */ nfsd4_conn_lost(&conn->cn_xpt_user); + if (ses->se_client->cl_cb_state == NFSD4_CB_DOWN && + dir & NFS4_CDFC4_BACK) { + /* callback channel may be back up */ + nfsd4_probe_callback(ses->se_client); + } return nfs_ok; } @@ -1047,12 +1055,12 @@ renew_client(struct nfs4_client *clp) /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ static int -STALE_CLIENTID(clientid_t *clid) +STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) { - if (clid->cl_boot == boot_time) + if (clid->cl_boot == nn->boot_time) return 0; dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", - clid->cl_boot, clid->cl_id, boot_time); + clid->cl_boot, clid->cl_id, nn->boot_time); return 1; } @@ -1215,7 +1223,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2) return true; } -static int +static bool same_creds(struct svc_cred *cr1, struct svc_cred *cr2) { if ((cr1->cr_flavor != cr2->cr_flavor) @@ -1227,14 +1235,15 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2) return true; if (!cr1->cr_principal || !cr2->cr_principal) return false; - return 0 == strcmp(cr1->cr_principal, cr1->cr_principal); + return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); } static void gen_clid(struct nfs4_client *clp) { static u32 current_clientid = 1; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - clp->cl_clientid.cl_boot = boot_time; + clp->cl_clientid.cl_boot = nn->boot_time; clp->cl_clientid.cl_id = current_clientid++; } @@ -2217,8 +2226,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - if (STALE_CLIENTID(clid)) + if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; nfs4_lock_state(); @@ -2577,8 +2587,9 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, unsigned int strhashval; struct nfs4_openowner *oo = NULL; __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - if (STALE_CLIENTID(&open->op_clientid)) + if (STALE_CLIENTID(&open->op_clientid, nn)) return nfserr_stale_clientid; /* * In case we need it later, after we've already created the @@ -2876,7 +2887,8 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) * Attempt to hand out a delegation. */ static void -nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp) +nfs4_open_delegation(struct net *net, struct svc_fh *fh, + struct nfsd4_open *open, struct nfs4_ol_stateid *stp) { struct nfs4_delegation *dp; struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); @@ -2897,7 +2909,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_ case NFS4_OPEN_CLAIM_NULL: /* Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... */ - if (locks_in_grace()) + if (locks_in_grace(net)) goto out; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out; @@ -3007,14 +3019,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); if (status) goto out; + status = nfsd4_truncate(rqstp, current_fh, open); + if (status) + goto out; stp = open->op_stp; open->op_stp = NULL; init_open_stateid(stp, fp, open); - status = nfsd4_truncate(rqstp, current_fh, open); - if (status) { - release_open_stateid(stp); - goto out; - } } update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); @@ -3033,7 +3043,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. */ - nfs4_open_delegation(current_fh, open, stp); + nfs4_open_delegation(SVC_NET(rqstp), current_fh, open, stp); nodeleg: status = nfs_ok; @@ -3087,12 +3097,13 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_client *clp; __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); nfs4_lock_state(); dprintk("process_renew(%08x/%08x): starting\n", clid->cl_boot, clid->cl_id); status = nfserr_stale_clientid; - if (STALE_CLIENTID(clid)) + if (STALE_CLIENTID(clid, nn)) goto out; clp = find_confirmed_client(clid); status = nfserr_expired; @@ -3111,22 +3122,19 @@ out: return status; } -static struct lock_manager nfsd4_manager = { -}; - -static bool grace_ended; - static void -nfsd4_end_grace(void) +nfsd4_end_grace(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + /* do nothing if grace period already ended */ - if (grace_ended) + if (nn->grace_ended) return; dprintk("NFSD: end of grace period\n"); - grace_ended = true; - nfsd4_record_grace_done(&init_net, boot_time); - locks_end_grace(&nfsd4_manager); + nn->grace_ended = true; + nfsd4_record_grace_done(net, nn->boot_time); + locks_end_grace(&nn->nfsd4_manager); /* * Now that every NFSv4 client has had the chance to recover and * to see the (possibly new, possibly shorter) lease time, we @@ -3149,7 +3157,7 @@ nfs4_laundromat(void) nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); - nfsd4_end_grace(); + nfsd4_end_grace(&init_net); INIT_LIST_HEAD(&reaplist); spin_lock(&client_lock); list_for_each_safe(pos, next, &client_lru) { @@ -3231,9 +3239,9 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *s } static int -STALE_STATEID(stateid_t *stateid) +STALE_STATEID(stateid_t *stateid, struct nfsd_net *nn) { - if (stateid->si_opaque.so_clid.cl_boot == boot_time) + if (stateid->si_opaque.so_clid.cl_boot == nn->boot_time) return 0; dprintk("NFSD: stale stateid " STATEID_FMT "!\n", STATEID_VAL(stateid)); @@ -3273,11 +3281,11 @@ out: } static inline __be32 -check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) +check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, int flags) { if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; - else if (locks_in_grace()) { + else if (locks_in_grace(net)) { /* Answer in remaining cases depends on existence of * conflicting state; so we must wait out the grace period. */ return nfserr_grace; @@ -3294,9 +3302,9 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) * that are not able to provide mandatory locking. */ static inline int -grace_disallows_io(struct inode *inode) +grace_disallows_io(struct net *net, struct inode *inode) { - return locks_in_grace() && mandatory_lock(inode); + return locks_in_grace(net) && mandatory_lock(inode); } /* Returns true iff a is later than b: */ @@ -3333,18 +3341,26 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } -__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) +static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; struct nfs4_ol_stateid *ols; __be32 status; - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; + /* Client debugging aid. */ + if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { + char addr_str[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *)&cl->cl_addr, addr_str, + sizeof(addr_str)); + pr_warn_ratelimited("NFSD: client %s testing state ID " + "with incorrect client ID\n", addr_str); + return nfserr_bad_stateid; + } s = find_stateid(cl, stateid); if (!s) - return nfserr_stale_stateid; + return nfserr_bad_stateid; status = check_stateid_generation(stateid, &s->sc_stateid, 1); if (status) return status; @@ -3360,10 +3376,11 @@ __be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) { struct nfs4_client *cl; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return nfserr_bad_stateid; - if (STALE_STATEID(stateid)) + if (STALE_STATEID(stateid, nn)) return nfserr_stale_stateid; cl = find_confirmed_client(&stateid->si_opaque.so_clid); if (!cl) @@ -3379,7 +3396,7 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, s * Checks for stateid operations */ __be32 -nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, +nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filpp) { struct nfs4_stid *s; @@ -3392,11 +3409,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (filpp) *filpp = NULL; - if (grace_disallows_io(ino)) + if (grace_disallows_io(net, ino)) return nfserr_grace; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - return check_special_stateids(current_fh, stateid, flags); + return check_special_stateids(net, current_fh, stateid, flags); status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); if (status) @@ -3463,7 +3480,8 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) - stateid->ts_id_status = nfs4_validate_stateid(cl, &stateid->ts_id_stateid); + stateid->ts_id_status = + nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); nfs4_unlock_state(); return nfs_ok; @@ -3750,12 +3768,19 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_close_open_stateid(stp); oo->oo_last_closed_stid = stp; - /* place unused nfs4_stateowners on so_close_lru list to be - * released by the laundromat service after the lease period - * to enable us to handle CLOSE replay - */ - if (list_empty(&oo->oo_owner.so_stateids)) - move_to_close_lru(oo); + if (list_empty(&oo->oo_owner.so_stateids)) { + if (cstate->minorversion) { + release_openowner(oo); + cstate->replay_owner = NULL; + } else { + /* + * In the 4.0 case we need to keep the owners around a + * little while to handle CLOSE replay. + */ + if (list_empty(&oo->oo_owner.so_stateids)) + move_to_close_lru(oo); + } + } out: if (!cstate->replay_owner) nfs4_unlock_state(); @@ -4027,6 +4052,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, bool new_state = false; int lkflg; int err; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", (long long) lock->lk_offset, @@ -4044,11 +4070,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); if (lock->lk_is_new) { - /* - * Client indicates that this is a new lockowner. - * Use open owner and open stateid to create lock owner and - * lock stateid. - */ struct nfs4_ol_stateid *open_stp = NULL; if (nfsd4_has_session(cstate)) @@ -4058,7 +4079,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, sizeof(clientid_t)); status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lock->lk_new_clientid)) + if (STALE_CLIENTID(&lock->lk_new_clientid, nn)) goto out; /* validate and update open stateid and open seqid */ @@ -4075,17 +4096,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new_state); - if (status) - goto out; - } else { - /* lock (lock owner + lock stateid) already exists */ + } else status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, NFS4_LOCK_STID, &lock_stp); - if (status) - goto out; - } + if (status) + goto out; lock_sop = lockowner(lock_stp->st_stateowner); lkflg = setlkflg(lock->lk_type); @@ -4094,10 +4111,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = nfserr_grace; - if (locks_in_grace() && !lock->lk_reclaim) + if (locks_in_grace(SVC_NET(rqstp)) && !lock->lk_reclaim) goto out; status = nfserr_no_grace; - if (!locks_in_grace() && lock->lk_reclaim) + if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim) goto out; locks_init_lock(&file_lock); @@ -4196,8 +4213,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file_lock file_lock; struct nfs4_lockowner *lo; __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); - if (locks_in_grace()) + if (locks_in_grace(SVC_NET(rqstp))) return nfserr_grace; if (check_lock_length(lockt->lt_offset, lockt->lt_length)) @@ -4206,7 +4224,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfserr_stale_clientid; - if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) + if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid, nn)) goto out; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) @@ -4355,6 +4373,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct list_head matches; unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); __be32 status; + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); @@ -4362,7 +4381,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, /* XXX check for lease expiration */ status = nfserr_stale_clientid; - if (STALE_CLIENTID(clid)) + if (STALE_CLIENTID(clid, nn)) return status; nfs4_lock_state(); @@ -4564,7 +4583,7 @@ void nfsd_forget_openowners(u64 num) printk(KERN_INFO "NFSD: Forgot %d open owners", count); } -int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *)) +int nfsd_process_n_delegations(u64 num, struct list_head *list) { int i, count = 0; struct nfs4_file *fp, *fnext; @@ -4573,7 +4592,7 @@ int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegatio for (i = 0; i < FILE_HASH_SIZE; i++) { list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) { list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) { - deleg_func(dp); + list_move(&dp->dl_recall_lru, list); if (++count == num) return count; } @@ -4586,9 +4605,16 @@ int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegatio void nfsd_forget_delegations(u64 num) { unsigned int count; + LIST_HEAD(victims); + struct nfs4_delegation *dp, *dnext; + + spin_lock(&recall_lock); + count = nfsd_process_n_delegations(num, &victims); + spin_unlock(&recall_lock); nfs4_lock_state(); - count = nfsd_process_n_delegations(num, unhash_delegation); + list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) + unhash_delegation(dp); nfs4_unlock_state(); printk(KERN_INFO "NFSD: Forgot %d delegations", count); @@ -4597,12 +4623,16 @@ void nfsd_forget_delegations(u64 num) void nfsd_recall_delegations(u64 num) { unsigned int count; + LIST_HEAD(victims); + struct nfs4_delegation *dp, *dnext; - nfs4_lock_state(); spin_lock(&recall_lock); - count = nfsd_process_n_delegations(num, nfsd_break_one_deleg); + count = nfsd_process_n_delegations(num, &victims); + list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) { + list_del(&dp->dl_recall_lru); + nfsd_break_one_deleg(dp); + } spin_unlock(&recall_lock); - nfs4_unlock_state(); printk(KERN_INFO "NFSD: Recalled %d delegations", count); } @@ -4665,6 +4695,8 @@ set_max_delegations(void) int nfs4_state_start(void) { + struct net *net = &init_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; /* @@ -4674,11 +4706,11 @@ nfs4_state_start(void) * to that instead and then do most of the rest of this on a per-net * basis. */ - get_net(&init_net); - nfsd4_client_tracking_init(&init_net); - boot_time = get_seconds(); - locks_start_grace(&nfsd4_manager); - grace_ended = false; + get_net(net); + nfsd4_client_tracking_init(net); + nn->boot_time = get_seconds(); + locks_start_grace(net, &nn->nfsd4_manager); + nn->grace_ended = false; printk(KERN_INFO "NFSD: starting %ld-second grace period\n", nfsd4_grace); ret = set_callback_cred(); @@ -4700,8 +4732,8 @@ nfs4_state_start(void) out_free_laundry: destroy_workqueue(laundry_wq); out_recovery: - nfsd4_client_tracking_exit(&init_net); - put_net(&init_net); + nfsd4_client_tracking_exit(net); + put_net(net); return ret; } @@ -4742,9 +4774,12 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { + struct net *net = &init_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + cancel_delayed_work_sync(&laundromat_work); destroy_workqueue(laundry_wq); - locks_end_grace(&nfsd4_manager); + locks_end_grace(&nn->nfsd4_manager); nfs4_lock_state(); __nfs4_state_shutdown(); nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 4949667c84ea..6322df36031f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2259,7 +2259,7 @@ out_acl: if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { if ((buflen -= 4) < 0) goto out_resource; - WRITE32(1); + WRITE32(0); } if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { if ((buflen -= 4) < 0) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c55298ed5772..fa49cff5ee65 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -673,9 +673,7 @@ static ssize_t __write_ports_addfd(char *buf) err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); if (err < 0) { - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); + nfsd_destroy(net); return err; } @@ -744,9 +742,7 @@ out_close: svc_xprt_put(xprt); } out_err: - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); + nfsd_destroy(net); return err; } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 1671429ffa66..2244222368ab 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -72,6 +72,19 @@ int nfsd_nrthreads(void); int nfsd_nrpools(void); int nfsd_get_nrthreads(int n, int *); int nfsd_set_nrthreads(int n, int *); +int nfsd_pool_stats_open(struct inode *, struct file *); +int nfsd_pool_stats_release(struct inode *, struct file *); + +static inline void nfsd_destroy(struct net *net) +{ + int destroy = (nfsd_serv->sv_nrthreads == 1); + + if (destroy) + svc_shutdown_net(nfsd_serv, net); + svc_destroy(nfsd_serv); + if (destroy) + nfsd_serv = NULL; +} #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) #ifdef CONFIG_NFSD_V2_ACL diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ee709fc8f58b..240473cb708f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -254,8 +254,6 @@ static void nfsd_shutdown(void) static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { - /* When last nfsd thread exits we need to do some clean-up */ - nfsd_serv = NULL; nfsd_shutdown(); svc_rpcb_cleanup(serv, net); @@ -332,6 +330,7 @@ static int nfsd_get_default_max_blksize(void) int nfsd_create_serv(void) { int error; + struct net *net = current->nsproxy->net_ns; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { @@ -346,7 +345,7 @@ int nfsd_create_serv(void) if (nfsd_serv == NULL) return -ENOMEM; - error = svc_bind(nfsd_serv, current->nsproxy->net_ns); + error = svc_bind(nfsd_serv, net); if (error < 0) { svc_destroy(nfsd_serv); return error; @@ -427,11 +426,7 @@ int nfsd_set_nrthreads(int n, int *nthreads) if (err) break; } - - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); - + nfsd_destroy(net); return err; } @@ -478,9 +473,7 @@ out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown(); out_destroy: - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); /* Release server */ + nfsd_destroy(net); /* Release server */ out: mutex_unlock(&nfsd_mutex); return error; @@ -563,12 +556,13 @@ nfsd(void *vrqstp) nfsdstats.th_cnt --; out: - if (rqstp->rq_server->sv_nrthreads == 1) - svc_shutdown_net(rqstp->rq_server, &init_net); + rqstp->rq_server = NULL; /* Release the thread */ svc_exit_thread(rqstp); + nfsd_destroy(&init_net); + /* Release module */ mutex_unlock(&nfsd_mutex); module_put_and_exit(0); @@ -682,9 +676,7 @@ int nfsd_pool_stats_release(struct inode *inode, struct file *file) mutex_lock(&nfsd_mutex); /* this function really, really should have been called svc_put() */ - if (nfsd_serv->sv_nrthreads == 1) - svc_shutdown_net(nfsd_serv, net); - svc_destroy(nfsd_serv); + nfsd_destroy(net); mutex_unlock(&nfsd_mutex); return ret; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 849091e16ea6..e6173147f982 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -450,8 +450,10 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) #define WR_STATE 0x00000020 struct nfsd4_compound_state; +struct nfsd_net; -extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, +extern __be32 nfs4_preprocess_stateid_op(struct net *net, + struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filp); extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); @@ -475,7 +477,6 @@ extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); extern int nfs4_client_to_reclaim(const char *name); extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); extern void release_session_client(struct nfsd4_session *); -extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); /* nfs4recover operations */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4700a0a929d7..702f64e820c3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -757,8 +757,16 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, * If we get here, then the client has already done an "open", * and (hopefully) checked permission - so allow OWNER_OVERRIDE * in case a chmod has now revoked permission. + * + * Arguably we should also allow the owner override for + * directories, but we never have and it doesn't seem to have + * caused anyone a problem. If we were to change this, note + * also that our filldir callbacks would need a variant of + * lookup_one_len that doesn't check permissions. */ - err = fh_verify(rqstp, fhp, type, may_flags | NFSD_MAY_OWNER_OVERRIDE); + if (type == S_IFREG) + may_flags |= NFSD_MAY_OWNER_OVERRIDE; + err = fh_verify(rqstp, fhp, type, may_flags); if (err) goto out; diff --git a/fs/super.c b/fs/super.c index c743fb3be4b8..4c5d82f56ec4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -320,7 +320,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock) /* * grab_super_passive - acquire a passive reference - * @s: reference we are trying to grab + * @sb: reference we are trying to grab * * Tries to acquire a passive reference. This is used in places where we * cannot take an active reference but we need to ensure that the diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index b1038bd686ac..489de625cd25 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -10,7 +10,7 @@ #include <linux/percpu_counter.h> #include <linux/log2.h> -#include <linux/proportions.h> +#include <linux/flex_proportions.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/sched.h> @@ -89,7 +89,7 @@ struct backing_dev_info { unsigned long dirty_ratelimit; unsigned long balanced_dirty_ratelimit; - struct prop_local_percpu completions; + struct fprop_local_percpu completions; int dirty_exceeded; unsigned int min_ratio; diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h new file mode 100644 index 000000000000..dad579b0c0e6 --- /dev/null +++ b/include/linux/ceph/ceph_features.h @@ -0,0 +1,27 @@ +#ifndef __CEPH_FEATURES +#define __CEPH_FEATURES + +/* + * feature bits + */ +#define CEPH_FEATURE_UID (1<<0) +#define CEPH_FEATURE_NOSRCADDR (1<<1) +#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) +#define CEPH_FEATURE_FLOCK (1<<3) +#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) +#define CEPH_FEATURE_MONNAMES (1<<5) +#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) +#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) +/* bits 8-17 defined by user-space; not supported yet here */ +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) + +/* + * Features supported. + */ +#define CEPH_FEATURES_SUPPORTED_DEFAULT \ + (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_CRUSH_TUNABLES) + +#define CEPH_FEATURES_REQUIRED_DEFAULT \ + (CEPH_FEATURE_NOSRCADDR) +#endif diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index e81ab30d4896..d021610efd65 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -35,20 +35,6 @@ /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ #define CEPH_MAX_MON 31 - -/* - * feature bits - */ -#define CEPH_FEATURE_UID (1<<0) -#define CEPH_FEATURE_NOSRCADDR (1<<1) -#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) -#define CEPH_FEATURE_FLOCK (1<<3) -#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) -#define CEPH_FEATURE_MONNAMES (1<<5) -#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) -#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) - - /* * ceph_file_layout - describe data layout for a file/inode */ diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index d8615dee5808..4bbf2db45f46 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -1,6 +1,7 @@ #ifndef __CEPH_DECODE_H #define __CEPH_DECODE_H +#include <linux/err.h> #include <linux/bug.h> #include <linux/time.h> #include <asm/unaligned.h> @@ -85,6 +86,52 @@ static inline int ceph_has_room(void **p, void *end, size_t n) } while (0) /* + * Allocate a buffer big enough to hold the wire-encoded string, and + * decode the string into it. The resulting string will always be + * terminated with '\0'. If successful, *p will be advanced + * past the decoded data. Also, if lenp is not a null pointer, the + * length (not including the terminating '\0') will be recorded in + * *lenp. Note that a zero-length string is a valid return value. + * + * Returns a pointer to the newly-allocated string buffer, or a + * pointer-coded errno if an error occurs. Neither *p nor *lenp + * will have been updated if an error is returned. + * + * There are two possible failures: + * - converting the string would require accessing memory at or + * beyond the "end" pointer provided (-E + * - memory could not be allocated for the result + */ +static inline char *ceph_extract_encoded_string(void **p, void *end, + size_t *lenp, gfp_t gfp) +{ + u32 len; + void *sp = *p; + char *buf; + + ceph_decode_32_safe(&sp, end, len, bad); + if (!ceph_has_room(&sp, end, len)) + goto bad; + + buf = kmalloc(len + 1, gfp); + if (!buf) + return ERR_PTR(-ENOMEM); + + if (len) + memcpy(buf, sp, len); + buf[len] = '\0'; + + *p = (char *) *p + sizeof (u32) + len; + if (lenp) + *lenp = (size_t) len; + + return buf; + +bad: + return ERR_PTR(-ERANGE); +} + +/* * struct ceph_timespec <-> struct timespec */ static inline void ceph_decode_timespec(struct timespec *ts, @@ -151,7 +198,7 @@ static inline void ceph_encode_filepath(void **p, void *end, u64 ino, const char *path) { u32 len = path ? strlen(path) : 0; - BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end); + BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end); ceph_encode_8(p, 1); ceph_encode_64(p, ino); ceph_encode_32(p, len); diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index e71d683982a6..42624789b06f 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -23,12 +23,6 @@ #include "ceph_fs.h" /* - * Supported features - */ -#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR - -/* * mount options */ #define CEPH_OPT_FSID (1<<0) @@ -132,7 +126,7 @@ struct ceph_client { u32 supported_features; u32 required_features; - struct ceph_messenger *msgr; /* messenger instance */ + struct ceph_messenger msgr; /* messenger instance */ struct ceph_mon_client monc; struct ceph_osd_client osdc; @@ -160,7 +154,7 @@ struct ceph_client { struct ceph_snap_context { atomic_t nref; u64 seq; - int num_snaps; + u32 num_snaps; u64 snaps[]; }; diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 44c87e731e9d..189ae0637634 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -31,9 +31,6 @@ struct ceph_connection_operations { int (*verify_authorizer_reply) (struct ceph_connection *con, int len); int (*invalidate_authorizer)(struct ceph_connection *con); - /* protocol version mismatch */ - void (*bad_proto) (struct ceph_connection *con); - /* there was some error on the socket (disconnect, whatever) */ void (*fault) (struct ceph_connection *con); @@ -53,6 +50,7 @@ struct ceph_messenger { struct ceph_entity_inst inst; /* my name+address */ struct ceph_entity_addr my_enc_addr; + atomic_t stopping; bool nocrc; /* @@ -80,7 +78,10 @@ struct ceph_msg { unsigned nr_pages; /* size of page array */ unsigned page_alignment; /* io offset in first page */ struct ceph_pagelist *pagelist; /* instead of pages */ + + struct ceph_connection *con; struct list_head list_head; + struct kref kref; struct bio *bio; /* instead of pages/pagelist */ struct bio *bio_iter; /* bio iterator */ @@ -106,23 +107,6 @@ struct ceph_msg_pos { #define MAX_DELAY_INTERVAL (5 * 60 * HZ) /* - * ceph_connection state bit flags - */ -#define LOSSYTX 0 /* we can close channel or drop messages on errors */ -#define CONNECTING 1 -#define NEGOTIATING 2 -#define KEEPALIVE_PENDING 3 -#define WRITE_PENDING 4 /* we have data ready to send */ -#define STANDBY 8 /* no outgoing messages, socket closed. we keep - * the ceph_connection around to maintain shared - * state with the peer. */ -#define CLOSED 10 /* we've closed the connection */ -#define SOCK_CLOSED 11 /* socket state changed to closed */ -#define OPENING 13 /* open connection w/ (possibly new) peer */ -#define DEAD 14 /* dead, about to kfree */ -#define BACKOFF 15 - -/* * A single connection with another host. * * We maintain a queue of outgoing messages, and some session state to @@ -131,18 +115,22 @@ struct ceph_msg_pos { */ struct ceph_connection { void *private; - atomic_t nref; const struct ceph_connection_operations *ops; struct ceph_messenger *msgr; + + atomic_t sock_state; struct socket *sock; - unsigned long state; /* connection state (see flags above) */ + struct ceph_entity_addr peer_addr; /* peer address */ + struct ceph_entity_addr peer_addr_for_me; + + unsigned long flags; + unsigned long state; const char *error_msg; /* error message, if any */ - struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_name peer_name; /* peer name */ - struct ceph_entity_addr peer_addr_for_me; + unsigned peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ @@ -207,24 +195,26 @@ extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); extern void ceph_msgr_flush(void); -extern struct ceph_messenger *ceph_messenger_create( - struct ceph_entity_addr *myaddr, - u32 features, u32 required); -extern void ceph_messenger_destroy(struct ceph_messenger *); +extern void ceph_messenger_init(struct ceph_messenger *msgr, + struct ceph_entity_addr *myaddr, + u32 supported_features, + u32 required_features, + bool nocrc); -extern void ceph_con_init(struct ceph_messenger *msgr, - struct ceph_connection *con); +extern void ceph_con_init(struct ceph_connection *con, void *private, + const struct ceph_connection_operations *ops, + struct ceph_messenger *msgr); extern void ceph_con_open(struct ceph_connection *con, + __u8 entity_type, __u64 entity_num, struct ceph_entity_addr *addr); extern bool ceph_con_opened(struct ceph_connection *con); extern void ceph_con_close(struct ceph_connection *con); extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); -extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); -extern void ceph_con_revoke_message(struct ceph_connection *con, - struct ceph_msg *msg); + +extern void ceph_msg_revoke(struct ceph_msg *msg); +extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); + extern void ceph_con_keepalive(struct ceph_connection *con); -extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); -extern void ceph_con_put(struct ceph_connection *con); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 545f85917780..2113e3850a4e 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -70,7 +70,7 @@ struct ceph_mon_client { bool hunting; int cur_mon; /* last monitor i contacted */ unsigned long sub_sent, sub_renew_after; - struct ceph_connection *con; + struct ceph_connection con; bool have_fsid; /* pending generic requests */ diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h index a362605f9368..09fa96b43436 100644 --- a/include/linux/ceph/msgpool.h +++ b/include/linux/ceph/msgpool.h @@ -11,10 +11,11 @@ struct ceph_msgpool { const char *name; mempool_t *pool; + int type; /* preallocated message type */ int front_len; /* preallocated payload size */ }; -extern int ceph_msgpool_init(struct ceph_msgpool *pool, +extern int ceph_msgpool_init(struct ceph_msgpool *pool, int type, int front_len, int size, bool blocking, const char *name); extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 7c4750811b96..25baa287cff7 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -154,6 +154,14 @@ struct crush_map { __s32 max_buckets; __u32 max_rules; __s32 max_devices; + + /* choose local retries before re-descent */ + __u32 choose_local_tries; + /* choose local attempts using a fallback permutation before + * re-descent */ + __u32 choose_local_fallback_tries; + /* choose attempts before giving up */ + __u32 choose_total_tries; }; diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h new file mode 100644 index 000000000000..4ebc49fae391 --- /dev/null +++ b/include/linux/flex_proportions.h @@ -0,0 +1,101 @@ +/* + * Floating proportions with flexible aging period + * + * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz> + */ + +#ifndef _LINUX_FLEX_PROPORTIONS_H +#define _LINUX_FLEX_PROPORTIONS_H + +#include <linux/percpu_counter.h> +#include <linux/spinlock.h> +#include <linux/seqlock.h> + +/* + * When maximum proportion of some event type is specified, this is the + * precision with which we allow limitting. Note that this creates an upper + * bound on the number of events per period like + * ULLONG_MAX >> FPROP_FRAC_SHIFT. + */ +#define FPROP_FRAC_SHIFT 10 +#define FPROP_FRAC_BASE (1UL << FPROP_FRAC_SHIFT) + +/* + * ---- Global proportion definitions ---- + */ +struct fprop_global { + /* Number of events in the current period */ + struct percpu_counter events; + /* Current period */ + unsigned int period; + /* Synchronization with period transitions */ + seqcount_t sequence; +}; + +int fprop_global_init(struct fprop_global *p); +void fprop_global_destroy(struct fprop_global *p); +bool fprop_new_period(struct fprop_global *p, int periods); + +/* + * ---- SINGLE ---- + */ +struct fprop_local_single { + /* the local events counter */ + unsigned long events; + /* Period in which we last updated events */ + unsigned int period; + raw_spinlock_t lock; /* Protect period and numerator */ +}; + +#define INIT_FPROP_LOCAL_SINGLE(name) \ +{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ +} + +int fprop_local_init_single(struct fprop_local_single *pl); +void fprop_local_destroy_single(struct fprop_local_single *pl); +void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl); +void fprop_fraction_single(struct fprop_global *p, + struct fprop_local_single *pl, unsigned long *numerator, + unsigned long *denominator); + +static inline +void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl) +{ + unsigned long flags; + + local_irq_save(flags); + __fprop_inc_single(p, pl); + local_irq_restore(flags); +} + +/* + * ---- PERCPU ---- + */ +struct fprop_local_percpu { + /* the local events counter */ + struct percpu_counter events; + /* Period in which we last updated events */ + unsigned int period; + raw_spinlock_t lock; /* Protect period and numerator */ +}; + +int fprop_local_init_percpu(struct fprop_local_percpu *pl); +void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); +void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); +void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, + int max_frac); +void fprop_fraction_percpu(struct fprop_global *p, + struct fprop_local_percpu *pl, unsigned long *numerator, + unsigned long *denominator); + +static inline +void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl) +{ + unsigned long flags; + + local_irq_save(flags); + __fprop_inc_percpu(p, pl); + local_irq_restore(flags); +} + +#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 8fabb037a48d..b178f9e91e23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1163,9 +1163,10 @@ struct lock_manager { struct list_head list; }; -void locks_start_grace(struct lock_manager *); +struct net; +void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); -int locks_in_grace(void); +int locks_in_grace(struct net *); /* that will die - we need it for nfs_lock_info */ #include <linux/nfs_fs_i.h> diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index f04ce6ac6d04..f5a051a79273 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -262,11 +262,11 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, int, struct nlm_cookie *, int); -__be32 nlmsvc_unlock(struct nlm_file *, struct nlm_lock *); +__be32 nlmsvc_unlock(struct net *net, struct nlm_file *, struct nlm_lock *); __be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, struct nlm_lock *, struct nlm_cookie *); -__be32 nlmsvc_cancel_blocked(struct nlm_file *, struct nlm_lock *); +__be32 nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *); unsigned long nlmsvc_retry_blocked(void); void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, nlm_host_match_fn_t match); @@ -279,7 +279,7 @@ void nlmsvc_release_call(struct nlm_rqst *); __be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, struct nfs_fh *); void nlm_release_file(struct nlm_file *); -void nlmsvc_mark_resources(void); +void nlmsvc_mark_resources(struct net *); void nlmsvc_free_host_resources(struct nlm_host *); void nlmsvc_invalidate_all(void); diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index f5fd6160dbca..f792794f6634 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -217,14 +217,32 @@ extern int qword_get(char **bpp, char *dest, int bufsize); static inline int get_int(char **bpp, int *anint) { char buf[50]; - char *ep; - int rv; - int len = qword_get(bpp, buf, 50); - if (len < 0) return -EINVAL; - if (len ==0) return -ENOENT; - rv = simple_strtol(buf, &ep, 0); - if (*ep) return -EINVAL; - *anint = rv; + int len = qword_get(bpp, buf, sizeof(buf)); + + if (len < 0) + return -EINVAL; + if (len == 0) + return -ENOENT; + + if (kstrtoint(buf, 0, anint)) + return -EINVAL; + + return 0; +} + +static inline int get_uint(char **bpp, unsigned int *anint) +{ + char buf[50]; + int len = qword_get(bpp, buf, sizeof(buf)); + + if (len < 0) + return -EINVAL; + if (len == 0) + return -ENOENT; + + if (kstrtouint(buf, 0, anint)) + return -EINVAL; + return 0; } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 40e0a273faea..d83db800fe02 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -278,6 +278,8 @@ struct svc_rqst { struct task_struct *rq_task; /* service thread */ }; +#define SVC_NET(svc_rqst) (svc_rqst->rq_xprt->xpt_net) + /* * Rigorous type checking on sockaddr type conversions */ diff --git a/lib/Makefile b/lib/Makefile index 9cb4104f47d9..42d283edc4d3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ - proportions.o prio_heap.o ratelimit.o show_mem.o \ + proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o lib-$(CONFIG_MMU) += ioremap.o diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c new file mode 100644 index 000000000000..c785554f9523 --- /dev/null +++ b/lib/flex_proportions.c @@ -0,0 +1,272 @@ +/* + * Floating proportions with flexible aging period + * + * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz> + * + * The goal of this code is: Given different types of event, measure proportion + * of each type of event over time. The proportions are measured with + * exponentially decaying history to give smooth transitions. A formula + * expressing proportion of event of type 'j' is: + * + * p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1}) + * + * Where x_{i,j} is j's number of events in i-th last time period and x_i is + * total number of events in i-th last time period. + * + * Note that p_{j}'s are normalised, i.e. + * + * \Sum_{j} p_{j} = 1, + * + * This formula can be straightforwardly computed by maintaing denominator + * (let's call it 'd') and for each event type its numerator (let's call it + * 'n_j'). When an event of type 'j' happens, we simply need to do: + * n_j++; d++; + * + * When a new period is declared, we could do: + * d /= 2 + * for each j + * n_j /= 2 + * + * To avoid iteration over all event types, we instead shift numerator of event + * j lazily when someone asks for a proportion of event j or when event j + * occurs. This can bit trivially implemented by remembering last period in + * which something happened with proportion of type j. + */ +#include <linux/flex_proportions.h> + +int fprop_global_init(struct fprop_global *p) +{ + int err; + + p->period = 0; + /* Use 1 to avoid dealing with periods with 0 events... */ + err = percpu_counter_init(&p->events, 1); + if (err) + return err; + seqcount_init(&p->sequence); + return 0; +} + +void fprop_global_destroy(struct fprop_global *p) +{ + percpu_counter_destroy(&p->events); +} + +/* + * Declare @periods new periods. It is upto the caller to make sure period + * transitions cannot happen in parallel. + * + * The function returns true if the proportions are still defined and false + * if aging zeroed out all events. This can be used to detect whether declaring + * further periods has any effect. + */ +bool fprop_new_period(struct fprop_global *p, int periods) +{ + u64 events; + unsigned long flags; + + local_irq_save(flags); + events = percpu_counter_sum(&p->events); + /* + * Don't do anything if there are no events. + */ + if (events <= 1) { + local_irq_restore(flags); + return false; + } + write_seqcount_begin(&p->sequence); + if (periods < 64) + events -= events >> periods; + /* Use addition to avoid losing events happening between sum and set */ + percpu_counter_add(&p->events, -events); + p->period += periods; + write_seqcount_end(&p->sequence); + local_irq_restore(flags); + + return true; +} + +/* + * ---- SINGLE ---- + */ + +int fprop_local_init_single(struct fprop_local_single *pl) +{ + pl->events = 0; + pl->period = 0; + raw_spin_lock_init(&pl->lock); + return 0; +} + +void fprop_local_destroy_single(struct fprop_local_single *pl) +{ +} + +static void fprop_reflect_period_single(struct fprop_global *p, + struct fprop_local_single *pl) +{ + unsigned int period = p->period; + unsigned long flags; + + /* Fast path - period didn't change */ + if (pl->period == period) + return; + raw_spin_lock_irqsave(&pl->lock, flags); + /* Someone updated pl->period while we were spinning? */ + if (pl->period >= period) { + raw_spin_unlock_irqrestore(&pl->lock, flags); + return; + } + /* Aging zeroed our fraction? */ + if (period - pl->period < BITS_PER_LONG) + pl->events >>= period - pl->period; + else + pl->events = 0; + pl->period = period; + raw_spin_unlock_irqrestore(&pl->lock, flags); +} + +/* Event of type pl happened */ +void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl) +{ + fprop_reflect_period_single(p, pl); + pl->events++; + percpu_counter_add(&p->events, 1); +} + +/* Return fraction of events of type pl */ +void fprop_fraction_single(struct fprop_global *p, + struct fprop_local_single *pl, + unsigned long *numerator, unsigned long *denominator) +{ + unsigned int seq; + s64 num, den; + + do { + seq = read_seqcount_begin(&p->sequence); + fprop_reflect_period_single(p, pl); + num = pl->events; + den = percpu_counter_read_positive(&p->events); + } while (read_seqcount_retry(&p->sequence, seq)); + + /* + * Make fraction <= 1 and denominator > 0 even in presence of percpu + * counter errors + */ + if (den <= num) { + if (num) + den = num; + else + den = 1; + } + *denominator = den; + *numerator = num; +} + +/* + * ---- PERCPU ---- + */ +#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) + +int fprop_local_init_percpu(struct fprop_local_percpu *pl) +{ + int err; + + err = percpu_counter_init(&pl->events, 0); + if (err) + return err; + pl->period = 0; + raw_spin_lock_init(&pl->lock); + return 0; +} + +void fprop_local_destroy_percpu(struct fprop_local_percpu *pl) +{ + percpu_counter_destroy(&pl->events); +} + +static void fprop_reflect_period_percpu(struct fprop_global *p, + struct fprop_local_percpu *pl) +{ + unsigned int period = p->period; + unsigned long flags; + + /* Fast path - period didn't change */ + if (pl->period == period) + return; + raw_spin_lock_irqsave(&pl->lock, flags); + /* Someone updated pl->period while we were spinning? */ + if (pl->period >= period) { + raw_spin_unlock_irqrestore(&pl->lock, flags); + return; + } + /* Aging zeroed our fraction? */ + if (period - pl->period < BITS_PER_LONG) { + s64 val = percpu_counter_read(&pl->events); + + if (val < (nr_cpu_ids * PROP_BATCH)) + val = percpu_counter_sum(&pl->events); + + __percpu_counter_add(&pl->events, + -val + (val >> (period-pl->period)), PROP_BATCH); + } else + percpu_counter_set(&pl->events, 0); + pl->period = period; + raw_spin_unlock_irqrestore(&pl->lock, flags); +} + +/* Event of type pl happened */ +void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl) +{ + fprop_reflect_period_percpu(p, pl); + __percpu_counter_add(&pl->events, 1, PROP_BATCH); + percpu_counter_add(&p->events, 1); +} + +void fprop_fraction_percpu(struct fprop_global *p, + struct fprop_local_percpu *pl, + unsigned long *numerator, unsigned long *denominator) +{ + unsigned int seq; + s64 num, den; + + do { + seq = read_seqcount_begin(&p->sequence); + fprop_reflect_period_percpu(p, pl); + num = percpu_counter_read_positive(&pl->events); + den = percpu_counter_read_positive(&p->events); + } while (read_seqcount_retry(&p->sequence, seq)); + + /* + * Make fraction <= 1 and denominator > 0 even in presence of percpu + * counter errors + */ + if (den <= num) { + if (num) + den = num; + else + den = 1; + } + *denominator = den; + *numerator = num; +} + +/* + * Like __fprop_inc_percpu() except that event is counted only if the given + * type has fraction smaller than @max_frac/FPROP_FRAC_BASE + */ +void __fprop_inc_percpu_max(struct fprop_global *p, + struct fprop_local_percpu *pl, int max_frac) +{ + if (unlikely(max_frac < FPROP_FRAC_BASE)) { + unsigned long numerator, denominator; + + fprop_fraction_percpu(p, pl, &numerator, &denominator); + if (numerator > + (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT) + return; + } else + fprop_reflect_period_percpu(p, pl); + __percpu_counter_add(&pl->events, 1, PROP_BATCH); + percpu_counter_add(&p->events, 1); +} diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dd8e2aafb07e..3387aea11209 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->min_ratio = 0; bdi->max_ratio = 100; - bdi->max_prop_frac = PROP_FRAC_BASE; + bdi->max_prop_frac = FPROP_FRAC_BASE; spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->work_list); @@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = INIT_BW; - err = prop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions); if (err) { err: @@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi) for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); - prop_local_destroy_percpu(&bdi->completions); + fprop_local_destroy_percpu(&bdi->completions); } EXPORT_SYMBOL(bdi_destroy); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 93d8d2f7108c..e5363f34e025 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,6 +34,7 @@ #include <linux/syscalls.h> #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ #include <linux/pagevec.h> +#include <linux/timer.h> #include <trace/events/writeback.h> /* @@ -135,7 +136,20 @@ unsigned long global_dirty_limit; * measured in page writeback completions. * */ -static struct prop_descriptor vm_completions; +static struct fprop_global writeout_completions; + +static void writeout_period(unsigned long t); +/* Timer for aging of writeout_completions */ +static struct timer_list writeout_period_timer = + TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); +static unsigned long writeout_period_time = 0; + +/* + * Length of period for aging writeout fractions of bdis. This is an + * arbitrarily chosen number. The longer the period, the slower fractions will + * reflect changes in current writeout rate. + */ +#define VM_COMPLETIONS_PERIOD_LEN (3*HZ) /* * Work out the current dirty-memory clamping and background writeout @@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone) zone_page_state(zone, NR_WRITEBACK) <= limit; } -/* - * couple the period to the dirty_ratio: - * - * period/2 ~ roundup_pow_of_two(dirty limit) - */ -static int calc_period_shift(void) -{ - unsigned long dirty_total; - - if (vm_dirty_bytes) - dirty_total = vm_dirty_bytes / PAGE_SIZE; - else - dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / - 100; - return 2 + ilog2(dirty_total - 1); -} - -/* - * update the period when the dirty threshold changes. - */ -static void update_completion_period(void) -{ - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); - - writeback_set_ratelimit(); -} - int dirty_background_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - update_completion_period(); + writeback_set_ratelimit(); vm_dirty_bytes = 0; } return ret; @@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write, ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { - update_completion_period(); + writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } +static unsigned long wp_next_time(unsigned long cur_time) +{ + cur_time += VM_COMPLETIONS_PERIOD_LEN; + /* 0 has a special meaning... */ + if (!cur_time) + return 1; + return cur_time; +} + /* * Increment the BDI's writeout completion count and the global writeout * completion count. Called from test_clear_page_writeback(). @@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write, static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { __inc_bdi_stat(bdi, BDI_WRITTEN); - __prop_inc_percpu_max(&vm_completions, &bdi->completions, - bdi->max_prop_frac); + __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, + bdi->max_prop_frac); + /* First event after period switching was turned off? */ + if (!unlikely(writeout_period_time)) { + /* + * We can race with other __bdi_writeout_inc calls here but + * it does not cause any harm since the resulting time when + * timer will fire and what is in writeout_period_time will be + * roughly the same. + */ + writeout_period_time = wp_next_time(jiffies); + mod_timer(&writeout_period_timer, writeout_period_time); + } } void bdi_writeout_inc(struct backing_dev_info *bdi) @@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc); static void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - prop_fraction_percpu(&vm_completions, &bdi->completions, + fprop_fraction_percpu(&writeout_completions, &bdi->completions, numerator, denominator); } /* + * On idle system, we can be called long after we scheduled because we use + * deferred timers so count with missed periods. + */ +static void writeout_period(unsigned long t) +{ + int miss_periods = (jiffies - writeout_period_time) / + VM_COMPLETIONS_PERIOD_LEN; + + if (fprop_new_period(&writeout_completions, miss_periods + 1)) { + writeout_period_time = wp_next_time(writeout_period_time + + miss_periods * VM_COMPLETIONS_PERIOD_LEN); + mod_timer(&writeout_period_timer, writeout_period_time); + } else { + /* + * Aging has zeroed all fractions. Stop wasting CPU on period + * updates. + */ + writeout_period_time = 0; + } +} + +/* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. @@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) ret = -EINVAL; } else { bdi->max_ratio = max_ratio; - bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; + bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; } spin_unlock_bh(&bdi_lock); @@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * bdi->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated - * code makes use of task_ratelimit to filter out sigular points and + * code makes use of task_ratelimit to filter out singular points and * limit the step size. * * The below code essentially only uses the relative value of @@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size - * and filter out the sigular points of balanced_dirty_ratelimit. Which + * and filter out the singular points of balanced_dirty_ratelimit. Which * keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). @@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { */ void __init page_writeback_init(void) { - int shift; - writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - shift = calc_period_shift(); - prop_descriptor_init(&vm_completions, shift); + fprop_global_init(&writeout_completions); } /** diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ba4323bce0e9..69e38db28e5f 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -17,6 +17,7 @@ #include <linux/string.h> +#include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/debugfs.h> #include <linux/ceph/decode.h> @@ -460,27 +461,23 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, client->auth_err = 0; client->extra_mon_dispatch = NULL; - client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | + client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | supported_features; - client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | + client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | required_features; /* msgr */ if (ceph_test_opt(client, MYIP)) myaddr = &client->options->my_addr; - client->msgr = ceph_messenger_create(myaddr, - client->supported_features, - client->required_features); - if (IS_ERR(client->msgr)) { - err = PTR_ERR(client->msgr); - goto fail; - } - client->msgr->nocrc = ceph_test_opt(client, NOCRC); + ceph_messenger_init(&client->msgr, myaddr, + client->supported_features, + client->required_features, + ceph_test_opt(client, NOCRC)); /* subsystems */ err = ceph_monc_init(&client->monc, client); if (err < 0) - goto fail_msgr; + goto fail; err = ceph_osdc_init(&client->osdc, client); if (err < 0) goto fail_monc; @@ -489,8 +486,6 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, fail_monc: ceph_monc_stop(&client->monc); -fail_msgr: - ceph_messenger_destroy(client->msgr); fail: kfree(client); return ERR_PTR(err); @@ -501,6 +496,8 @@ void ceph_destroy_client(struct ceph_client *client) { dout("destroy_client %p\n", client); + atomic_set(&client->msgr.stopping, 1); + /* unmount */ ceph_osdc_stop(&client->osdc); @@ -508,8 +505,6 @@ void ceph_destroy_client(struct ceph_client *client) ceph_debugfs_client_cleanup(client); - ceph_messenger_destroy(client->msgr); - ceph_destroy_options(client->options); kfree(client); diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index d7edc24333b8..35fce755ce10 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map, int item = 0; int itemtype; int collide, reject; - const unsigned int orig_tries = 5; /* attempts before we fall back to search */ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep); @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map, reject = 1; goto reject; } - if (flocal >= (in->size>>1) && - flocal > orig_tries) + if (map->choose_local_fallback_tries > 0 && + flocal >= (in->size>>1) && + flocal > map->choose_local_fallback_tries) item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); @@ -422,13 +422,14 @@ reject: ftotal++; flocal++; - if (collide && flocal < 3) + if (collide && flocal <= map->choose_local_tries) /* retry locally a few times */ retry_bucket = 1; - else if (flocal <= in->size + orig_tries) + else if (map->choose_local_fallback_tries > 0 && + flocal <= in->size + map->choose_local_fallback_tries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal < 20) + else if (ftotal <= map->choose_total_tries) /* then retry descent */ retry_descent = 1; else diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 10255e81be79..b9796750034a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -29,6 +29,74 @@ * the sender. */ +/* + * We track the state of the socket on a given connection using + * values defined below. The transition to a new socket state is + * handled by a function which verifies we aren't coming from an + * unexpected state. + * + * -------- + * | NEW* | transient initial state + * -------- + * | con_sock_state_init() + * v + * ---------- + * | CLOSED | initialized, but no socket (and no + * ---------- TCP connection) + * ^ \ + * | \ con_sock_state_connecting() + * | ---------------------- + * | \ + * + con_sock_state_closed() \ + * |+--------------------------- \ + * | \ \ \ + * | ----------- \ \ + * | | CLOSING | socket event; \ \ + * | ----------- await close \ \ + * | ^ \ | + * | | \ | + * | + con_sock_state_closing() \ | + * | / \ | | + * | / --------------- | | + * | / \ v v + * | / -------------- + * | / -----------------| CONNECTING | socket created, TCP + * | | / -------------- connect initiated + * | | | con_sock_state_connected() + * | | v + * ------------- + * | CONNECTED | TCP connection established + * ------------- + * + * State values for ceph_connection->sock_state; NEW is assumed to be 0. + */ + +#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ +#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ +#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ +#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ +#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ + +/* + * connection states + */ +#define CON_STATE_CLOSED 1 /* -> PREOPEN */ +#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ +#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ +#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ +#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ +#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ + +/* + * ceph_connection flag bits + */ +#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop + * messages on errors */ +#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ +#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ + /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; @@ -147,72 +215,130 @@ void ceph_msgr_flush(void) } EXPORT_SYMBOL(ceph_msgr_flush); +/* Connection socket state transition functions */ + +static void con_sock_state_init(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); + if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSED); +} + +static void con_sock_state_connecting(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); + if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CONNECTING); +} + +static void con_sock_state_connected(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CONNECTED); +} + +static void con_sock_state_closing(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && + old_state != CON_SOCK_STATE_CONNECTED && + old_state != CON_SOCK_STATE_CLOSING)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSING); +} + +static void con_sock_state_closed(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && + old_state != CON_SOCK_STATE_CLOSING && + old_state != CON_SOCK_STATE_CONNECTING && + old_state != CON_SOCK_STATE_CLOSED)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSED); +} /* * socket callback functions */ /* data available on socket, or listen socket received a connect */ -static void ceph_data_ready(struct sock *sk, int count_unused) +static void ceph_sock_data_ready(struct sock *sk, int count_unused) { struct ceph_connection *con = sk->sk_user_data; + if (atomic_read(&con->msgr->stopping)) { + return; + } if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("ceph_data_ready on %p state = %lu, queueing work\n", + dout("%s on %p state = %lu, queueing work\n", __func__, con, con->state); queue_con(con); } } /* socket has buffer space for writing */ -static void ceph_write_space(struct sock *sk) +static void ceph_sock_write_space(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept - * more data. clear SOCK_NOSPACE so that ceph_write_space() + * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() * doesn't get called again until try_write() fills the socket * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ - if (test_bit(WRITE_PENDING, &con->state)) { + if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { - dout("ceph_write_space %p queueing write work\n", con); + dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); queue_con(con); } } else { - dout("ceph_write_space %p nothing to write\n", con); + dout("%s %p nothing to write\n", __func__, con); } } /* socket's state has changed */ -static void ceph_state_change(struct sock *sk) +static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; - dout("ceph_state_change %p state = %lu sk_state = %u\n", + dout("%s %p state = %lu sk_state = %u\n", __func__, con, con->state, sk->sk_state); - if (test_bit(CLOSED, &con->state)) - return; - switch (sk->sk_state) { case TCP_CLOSE: - dout("ceph_state_change TCP_CLOSE\n"); + dout("%s TCP_CLOSE\n", __func__); case TCP_CLOSE_WAIT: - dout("ceph_state_change TCP_CLOSE_WAIT\n"); - if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { - if (test_bit(CONNECTING, &con->state)) - con->error_msg = "connection failed"; - else - con->error_msg = "socket closed"; - queue_con(con); - } + dout("%s TCP_CLOSE_WAIT\n", __func__); + con_sock_state_closing(con); + set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); + queue_con(con); break; case TCP_ESTABLISHED: - dout("ceph_state_change TCP_ESTABLISHED\n"); + dout("%s TCP_ESTABLISHED\n", __func__); + con_sock_state_connected(con); queue_con(con); break; default: /* Everything else is uninteresting */ @@ -228,9 +354,9 @@ static void set_sock_callbacks(struct socket *sock, { struct sock *sk = sock->sk; sk->sk_user_data = con; - sk->sk_data_ready = ceph_data_ready; - sk->sk_write_space = ceph_write_space; - sk->sk_state_change = ceph_state_change; + sk->sk_data_ready = ceph_sock_data_ready; + sk->sk_write_space = ceph_sock_write_space; + sk->sk_state_change = ceph_sock_state_change; } @@ -262,6 +388,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); + con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), O_NONBLOCK); if (ret == -EINPROGRESS) { @@ -277,7 +404,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } con->sock = sock; - return 0; } @@ -333,16 +459,24 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, */ static int con_close_socket(struct ceph_connection *con) { - int rc; + int rc = 0; dout("con_close_socket on %p sock %p\n", con, con->sock); - if (!con->sock) - return 0; - set_bit(SOCK_CLOSED, &con->state); - rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); - sock_release(con->sock); - con->sock = NULL; - clear_bit(SOCK_CLOSED, &con->state); + if (con->sock) { + rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); + sock_release(con->sock); + con->sock = NULL; + } + + /* + * Forcibly clear the SOCK_CLOSED flag. It gets set + * independent of the connection mutex, and we could have + * received a socket close event before we had the chance to + * shut the socket down. + */ + clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags); + + con_sock_state_closed(con); return rc; } @@ -353,6 +487,10 @@ static int con_close_socket(struct ceph_connection *con) static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); + BUG_ON(msg->con == NULL); + msg->con->ops->put(msg->con); + msg->con = NULL; + ceph_msg_put(msg); } static void ceph_msg_remove_list(struct list_head *head) @@ -372,8 +510,11 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_remove_list(&con->out_sent); if (con->in_msg) { + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; + con->ops->put(con); } con->connect_seq = 0; @@ -391,32 +532,44 @@ static void reset_connection(struct ceph_connection *con) */ void ceph_con_close(struct ceph_connection *con) { + mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr.in_addr)); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ - clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ - clear_bit(KEEPALIVE_PENDING, &con->state); - clear_bit(WRITE_PENDING, &con->state); - mutex_lock(&con->mutex); + con->state = CON_STATE_CLOSED; + + clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ + clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); + clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); + clear_bit(CON_FLAG_BACKOFF, &con->flags); + reset_connection(con); con->peer_global_seq = 0; cancel_delayed_work(&con->work); + con_close_socket(con); mutex_unlock(&con->mutex); - queue_con(con); } EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ -void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) +void ceph_con_open(struct ceph_connection *con, + __u8 entity_type, __u64 entity_num, + struct ceph_entity_addr *addr) { + mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); - set_bit(OPENING, &con->state); - clear_bit(CLOSED, &con->state); + + BUG_ON(con->state != CON_STATE_CLOSED); + con->state = CON_STATE_PREOPEN; + + con->peer_name.type = (__u8) entity_type; + con->peer_name.num = cpu_to_le64(entity_num); + memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ + mutex_unlock(&con->mutex); queue_con(con); } EXPORT_SYMBOL(ceph_con_open); @@ -430,42 +583,26 @@ bool ceph_con_opened(struct ceph_connection *con) } /* - * generic get/put - */ -struct ceph_connection *ceph_con_get(struct ceph_connection *con) -{ - int nref = __atomic_add_unless(&con->nref, 1, 0); - - dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1); - - return nref ? con : NULL; -} - -void ceph_con_put(struct ceph_connection *con) -{ - int nref = atomic_dec_return(&con->nref); - - BUG_ON(nref < 0); - if (nref == 0) { - BUG_ON(con->sock); - kfree(con); - } - dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref); -} - -/* * initialize a new connection. */ -void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) +void ceph_con_init(struct ceph_connection *con, void *private, + const struct ceph_connection_operations *ops, + struct ceph_messenger *msgr) { dout("con_init %p\n", con); memset(con, 0, sizeof(*con)); - atomic_set(&con->nref, 1); + con->private = private; + con->ops = ops; con->msgr = msgr; + + con_sock_state_init(con); + mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, con_work); + + con->state = CON_STATE_CLOSED; } EXPORT_SYMBOL(ceph_con_init); @@ -486,14 +623,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) return ret; } -static void ceph_con_out_kvec_reset(struct ceph_connection *con) +static void con_out_kvec_reset(struct ceph_connection *con) { con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; } -static void ceph_con_out_kvec_add(struct ceph_connection *con, +static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { int index; @@ -507,6 +644,53 @@ static void ceph_con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +#ifdef CONFIG_BLOCK +static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +{ + if (!bio) { + *iter = NULL; + *seg = 0; + return; + } + *iter = bio; + *seg = bio->bi_idx; +} + +static void iter_bio_next(struct bio **bio_iter, int *seg) +{ + if (*bio_iter == NULL) + return; + + BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + + (*seg)++; + if (*seg == (*bio_iter)->bi_vcnt) + init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); +} +#endif + +static void prepare_write_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + BUG_ON(!msg); + BUG_ON(!msg->hdr.data_len); + + /* initialize page iterator */ + con->out_msg_pos.page = 0; + if (msg->pages) + con->out_msg_pos.page_pos = msg->page_alignment; + else + con->out_msg_pos.page_pos = 0; +#ifdef CONFIG_BLOCK + if (msg->bio) + init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); +#endif + con->out_msg_pos.data_pos = 0; + con->out_msg_pos.did_page_crc = false; + con->out_more = 1; /* data + footer will follow */ +} + /* * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. @@ -516,6 +700,8 @@ static void prepare_write_message_footer(struct ceph_connection *con) struct ceph_msg *m = con->out_msg; int v = con->out_kvec_left; + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + dout("prepare_write_message_footer %p\n", con); con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; @@ -534,7 +720,7 @@ static void prepare_write_message(struct ceph_connection *con) struct ceph_msg *m; u32 crc; - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); con->out_kvec_is_msg = true; con->out_msg_done = false; @@ -542,14 +728,16 @@ static void prepare_write_message(struct ceph_connection *con) * TCP packet that's a good thing. */ if (con->in_seq > con->in_seq_acked) { con->in_seq_acked = con->in_seq; - ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); } + BUG_ON(list_empty(&con->out_queue)); m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); con->out_msg = m; + BUG_ON(m->con != con); /* put message on sent list */ ceph_msg_get(m); @@ -576,18 +764,18 @@ static void prepare_write_message(struct ceph_connection *con) BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ - ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); - ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) - ceph_con_out_kvec_add(con, m->middle->vec.iov_len, + con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); /* fill in crc (except data pages), footer */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; + con->out_msg->footer.flags = 0; crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); @@ -597,28 +785,19 @@ static void prepare_write_message(struct ceph_connection *con) con->out_msg->footer.middle_crc = cpu_to_le32(crc); } else con->out_msg->footer.middle_crc = 0; - con->out_msg->footer.data_crc = 0; - dout("prepare_write_message front_crc %u data_crc %u\n", + dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); /* is there a data payload? */ - if (le32_to_cpu(m->hdr.data_len) > 0) { - /* initialize page iterator */ - con->out_msg_pos.page = 0; - if (m->pages) - con->out_msg_pos.page_pos = m->page_alignment; - else - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = false; - con->out_more = 1; /* data + footer will follow */ - } else { + con->out_msg->footer.data_crc = 0; + if (m->hdr.data_len) + prepare_write_message_data(con); + else /* no, queue up footer too and be done */ prepare_write_message_footer(con); - } - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -630,16 +809,16 @@ static void prepare_write_ack(struct ceph_connection *con) con->in_seq_acked, con->in_seq); con->in_seq_acked = con->in_seq; - ceph_con_out_kvec_reset(con); + con_out_kvec_reset(con); - ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), + con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. */ - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -648,9 +827,9 @@ static void prepare_write_ack(struct ceph_connection *con) static void prepare_write_keepalive(struct ceph_connection *con) { dout("prepare_write_keepalive %p\n", con); - ceph_con_out_kvec_reset(con); - ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); - set_bit(WRITE_PENDING, &con->state); + con_out_kvec_reset(con); + con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -665,27 +844,21 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection if (!con->ops->get_authorizer) { con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; con->out_connect.authorizer_len = 0; - return NULL; } /* Can't hold the mutex while getting authorizer */ - mutex_unlock(&con->mutex); - auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); - mutex_lock(&con->mutex); if (IS_ERR(auth)) return auth; - if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) + if (con->state != CON_STATE_NEGOTIATING) return ERR_PTR(-EAGAIN); con->auth_reply_buf = auth->authorizer_reply_buf; con->auth_reply_buf_len = auth->authorizer_reply_buf_len; - - return auth; } @@ -694,12 +867,12 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection */ static void prepare_write_banner(struct ceph_connection *con) { - ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - ceph_con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), &con->msgr->my_enc_addr); con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } static int prepare_write_connect(struct ceph_connection *con) @@ -742,14 +915,15 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.authorizer_len = auth ? cpu_to_le32(auth->authorizer_buf_len) : 0; - ceph_con_out_kvec_add(con, sizeof (con->out_connect), + con_out_kvec_reset(con); + con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); if (auth && auth->authorizer_buf_len) - ceph_con_out_kvec_add(con, auth->authorizer_buf_len, + con_out_kvec_add(con, auth->authorizer_buf_len, auth->authorizer_buf); con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); return 0; } @@ -797,30 +971,34 @@ out: return ret; /* done! */ } -#ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +static void out_msg_pos_next(struct ceph_connection *con, struct page *page, + size_t len, size_t sent, bool in_trail) { - if (!bio) { - *iter = NULL; - *seg = 0; - return; - } - *iter = bio; - *seg = bio->bi_idx; -} + struct ceph_msg *msg = con->out_msg; -static void iter_bio_next(struct bio **bio_iter, int *seg) -{ - if (*bio_iter == NULL) - return; + BUG_ON(!msg); + BUG_ON(!sent); - BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + con->out_msg_pos.data_pos += sent; + con->out_msg_pos.page_pos += sent; + if (sent < len) + return; - (*seg)++; - if (*seg == (*bio_iter)->bi_vcnt) - init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); -} + BUG_ON(sent != len); + con->out_msg_pos.page_pos = 0; + con->out_msg_pos.page++; + con->out_msg_pos.did_page_crc = false; + if (in_trail) + list_move_tail(&page->lru, + &msg->trail->head); + else if (msg->pagelist) + list_move_tail(&page->lru, + &msg->pagelist->head); +#ifdef CONFIG_BLOCK + else if (msg->bio) + iter_bio_next(&msg->bio_iter, &msg->bio_seg); #endif +} /* * Write as much message data payload as we can. If we finish, queue @@ -837,41 +1015,36 @@ static int write_partial_msg_pages(struct ceph_connection *con) bool do_datacrc = !con->msgr->nocrc; int ret; int total_max_write; - int in_trail = 0; - size_t trail_len = (msg->trail ? msg->trail->length : 0); + bool in_trail = false; + const size_t trail_len = (msg->trail ? msg->trail->length : 0); + const size_t trail_off = data_len - trail_len; dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, + con, msg, con->out_msg_pos.page, msg->nr_pages, con->out_msg_pos.page_pos); -#ifdef CONFIG_BLOCK - if (msg->bio && !msg->bio_iter) - init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); -#endif - + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. + * + * If we are calculating the data crc (the default), we will + * need to map the page. If we have no pages, they have + * been revoked, so use the zero page. + */ while (data_len > con->out_msg_pos.data_pos) { struct page *page = NULL; int max_write = PAGE_SIZE; int bio_offset = 0; - total_max_write = data_len - trail_len - - con->out_msg_pos.data_pos; - - /* - * if we are calculating the data crc (the default), we need - * to map the page. if our pages[] has been revoked, use the - * zero page. - */ - - /* have we reached the trail part of the data? */ - if (con->out_msg_pos.data_pos >= data_len - trail_len) { - in_trail = 1; + in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; + if (!in_trail) + total_max_write = trail_off - con->out_msg_pos.data_pos; + if (in_trail) { total_max_write = data_len - con->out_msg_pos.data_pos; page = list_first_entry(&msg->trail->head, struct page, lru); - max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; } else if (msg->pagelist) { @@ -894,15 +1067,14 @@ static int write_partial_msg_pages(struct ceph_connection *con) if (do_datacrc && !con->out_msg_pos.did_page_crc) { void *base; - u32 crc; - u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); + u32 crc = le32_to_cpu(msg->footer.data_crc); char *kaddr; kaddr = kmap(page); BUG_ON(kaddr == NULL); base = kaddr + con->out_msg_pos.page_pos + bio_offset; - crc = crc32c(tmpcrc, base, len); - con->out_msg->footer.data_crc = cpu_to_le32(crc); + crc = crc32c(crc, base, len); + msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } ret = ceph_tcp_sendpage(con->sock, page, @@ -915,31 +1087,15 @@ static int write_partial_msg_pages(struct ceph_connection *con) if (ret <= 0) goto out; - con->out_msg_pos.data_pos += ret; - con->out_msg_pos.page_pos += ret; - if (ret == len) { - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = false; - if (in_trail) - list_move_tail(&page->lru, - &msg->trail->head); - else if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); -#ifdef CONFIG_BLOCK - else if (msg->bio) - iter_bio_next(&msg->bio_iter, &msg->bio_seg); -#endif - } + out_msg_pos_next(con, page, len, (size_t) ret, in_trail); } dout("write_partial_msg_pages %p msg %p done\n", con, msg); /* prepare and queue up footer, too */ if (!do_datacrc) - con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - ceph_con_out_kvec_reset(con); + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); prepare_write_message_footer(con); ret = 1; out: @@ -1351,20 +1507,14 @@ static int process_banner(struct ceph_connection *con) ceph_pr_addr(&con->msgr->inst.addr.in_addr)); } - set_bit(NEGOTIATING, &con->state); - prepare_read_connect(con); return 0; } static void fail_protocol(struct ceph_connection *con) { reset_connection(con); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - - mutex_unlock(&con->mutex); - if (con->ops->bad_proto) - con->ops->bad_proto(con); - mutex_lock(&con->mutex); + BUG_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_CLOSED; } static int process_connect(struct ceph_connection *con) @@ -1407,7 +1557,6 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; - ceph_con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1428,7 +1577,6 @@ static int process_connect(struct ceph_connection *con) ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); - ceph_con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1440,8 +1588,7 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) + if (con->state != CON_STATE_NEGOTIATING) return -EAGAIN; break; @@ -1454,7 +1601,6 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->out_connect.connect_seq), le32_to_cpu(con->in_reply.connect_seq)); con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); - ceph_con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1471,7 +1617,6 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.global_seq)); get_global_seq(con->msgr, le32_to_cpu(con->in_reply.global_seq)); - ceph_con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1489,7 +1634,10 @@ static int process_connect(struct ceph_connection *con) fail_protocol(con); return -1; } - clear_bit(CONNECTING, &con->state); + + BUG_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_OPEN; + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; con->peer_features = server_feat; @@ -1501,7 +1649,9 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - set_bit(LOSSYTX, &con->state); + set_bit(CON_FLAG_LOSSYTX, &con->flags); + + con->delay = 0; /* reset backoff memory */ prepare_read_tag(con); break; @@ -1587,10 +1737,7 @@ static int read_partial_message_section(struct ceph_connection *con, return 1; } -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip); - +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); static int read_partial_message_pages(struct ceph_connection *con, struct page **pages, @@ -1633,9 +1780,6 @@ static int read_partial_message_bio(struct ceph_connection *con, void *p; int ret, left; - if (IS_ERR(bv)) - return PTR_ERR(bv); - left = min((int)(data_len - con->in_msg_pos.data_pos), (int)(bv->bv_len - con->in_msg_pos.page_pos)); @@ -1672,7 +1816,6 @@ static int read_partial_message(struct ceph_connection *con) int ret; unsigned int front_len, middle_len, data_len; bool do_datacrc = !con->msgr->nocrc; - int skip; u64 seq; u32 crc; @@ -1723,10 +1866,13 @@ static int read_partial_message(struct ceph_connection *con) /* allocate message? */ if (!con->in_msg) { + int skip = 0; + dout("got hdr type %d front %d data %d\n", con->in_hdr.type, con->in_hdr.front_len, con->in_hdr.data_len); - skip = 0; - con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); + ret = ceph_con_in_msg_alloc(con, &skip); + if (ret < 0) + return ret; if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); @@ -1737,11 +1883,9 @@ static int read_partial_message(struct ceph_connection *con) con->in_seq++; return 0; } - if (!con->in_msg) { - con->error_msg = - "error allocating memory for incoming message"; - return -ENOMEM; - } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); m = con->in_msg; m->front.iov_len = 0; /* haven't read it yet */ if (m->middle) @@ -1753,6 +1897,11 @@ static int read_partial_message(struct ceph_connection *con) else con->in_msg_pos.page_pos = 0; con->in_msg_pos.data_pos = 0; + +#ifdef CONFIG_BLOCK + if (m->bio) + init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); +#endif } /* front */ @@ -1769,10 +1918,6 @@ static int read_partial_message(struct ceph_connection *con) if (ret <= 0) return ret; } -#ifdef CONFIG_BLOCK - if (m->bio && !m->bio_iter) - init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); -#endif /* (page) data */ while (con->in_msg_pos.data_pos < data_len) { @@ -1783,7 +1928,7 @@ static int read_partial_message(struct ceph_connection *con) return ret; #ifdef CONFIG_BLOCK } else if (m->bio) { - + BUG_ON(!m->bio_iter); ret = read_partial_message_bio(con, &m->bio_iter, &m->bio_seg, data_len, do_datacrc); @@ -1837,8 +1982,11 @@ static void process_message(struct ceph_connection *con) { struct ceph_msg *msg; + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; msg = con->in_msg; con->in_msg = NULL; + con->ops->put(con); /* if first message, set peer_name */ if (con->peer_name.type == 0) @@ -1858,7 +2006,6 @@ static void process_message(struct ceph_connection *con) con->ops->dispatch(con, msg); mutex_lock(&con->mutex); - prepare_read_tag(con); } @@ -1870,22 +2017,19 @@ static int try_write(struct ceph_connection *con) { int ret = 1; - dout("try_write start %p state %lu nref %d\n", con, con->state, - atomic_read(&con->nref)); + dout("try_write start %p state %lu\n", con, con->state); more: dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); /* open the socket first? */ - if (con->sock == NULL) { - ceph_con_out_kvec_reset(con); + if (con->state == CON_STATE_PREOPEN) { + BUG_ON(con->sock); + con->state = CON_STATE_CONNECTING; + + con_out_kvec_reset(con); prepare_write_banner(con); - ret = prepare_write_connect(con); - if (ret < 0) - goto out; prepare_read_banner(con); - set_bit(CONNECTING, &con->state); - clear_bit(NEGOTIATING, &con->state); BUG_ON(con->in_msg); con->in_tag = CEPH_MSGR_TAG_READY; @@ -1932,7 +2076,7 @@ more_kvec: } do_next: - if (!test_bit(CONNECTING, &con->state)) { + if (con->state == CON_STATE_OPEN) { /* is anything else pending? */ if (!list_empty(&con->out_queue)) { prepare_write_message(con); @@ -1942,14 +2086,15 @@ do_next: prepare_write_ack(con); goto more; } - if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { + if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, + &con->flags)) { prepare_write_keepalive(con); goto more; } } /* Nothing to do! */ - clear_bit(WRITE_PENDING, &con->state); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); dout("try_write nothing else to write.\n"); ret = 0; out: @@ -1966,38 +2111,42 @@ static int try_read(struct ceph_connection *con) { int ret = -1; - if (!con->sock) - return 0; - - if (test_bit(STANDBY, &con->state)) +more: + dout("try_read start on %p state %lu\n", con, con->state); + if (con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != CON_STATE_OPEN) return 0; - dout("try_read start on %p\n", con); + BUG_ON(!con->sock); -more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); - /* - * process_connect and process_message drop and re-take - * con->mutex. make sure we handle a racing close or reopen. - */ - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) { - ret = -EAGAIN; + if (con->state == CON_STATE_CONNECTING) { + dout("try_read connecting\n"); + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + BUG_ON(con->state != CON_STATE_CONNECTING); + con->state = CON_STATE_NEGOTIATING; + + /* Banner is good, exchange connection info */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ goto out; } - if (test_bit(CONNECTING, &con->state)) { - if (!test_bit(NEGOTIATING, &con->state)) { - dout("try_read connecting\n"); - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - } + if (con->state == CON_STATE_NEGOTIATING) { + dout("try_read negotiating\n"); ret = read_partial_connect(con); if (ret <= 0) goto out; @@ -2007,6 +2156,8 @@ more: goto more; } + BUG_ON(con->state != CON_STATE_OPEN); + if (con->in_base_pos < 0) { /* * skipping + discarding content. @@ -2040,7 +2191,8 @@ more: prepare_read_ack(con); break; case CEPH_MSGR_TAG_CLOSE: - set_bit(CLOSED, &con->state); /* fixme */ + con_close_socket(con); + con->state = CON_STATE_CLOSED; goto out; default: goto bad_tag; @@ -2063,6 +2215,8 @@ more: if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; process_message(con); + if (con->state == CON_STATE_OPEN) + prepare_read_tag(con); goto more; } if (con->in_tag == CEPH_MSGR_TAG_ACK) { @@ -2091,12 +2245,6 @@ bad_tag: */ static void queue_con(struct ceph_connection *con) { - if (test_bit(DEAD, &con->state)) { - dout("queue_con %p ignoring: DEAD\n", - con); - return; - } - if (!con->ops->get(con)) { dout("queue_con %p ref count 0\n", con); return; @@ -2121,7 +2269,26 @@ static void con_work(struct work_struct *work) mutex_lock(&con->mutex); restart: - if (test_and_clear_bit(BACKOFF, &con->state)) { + if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) { + switch (con->state) { + case CON_STATE_CONNECTING: + con->error_msg = "connection failed"; + break; + case CON_STATE_NEGOTIATING: + con->error_msg = "negotiation failed"; + break; + case CON_STATE_OPEN: + con->error_msg = "socket closed"; + break; + default: + dout("unrecognized con state %d\n", (int)con->state); + con->error_msg = "unrecognized con state"; + BUG(); + } + goto fault; + } + + if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { dout("con_work %p backing off\n", con); if (queue_delayed_work(ceph_msgr_wq, &con->work, round_jiffies_relative(con->delay))) { @@ -2135,35 +2302,35 @@ restart: } } - if (test_bit(STANDBY, &con->state)) { + if (con->state == CON_STATE_STANDBY) { dout("con_work %p STANDBY\n", con); goto done; } - if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ - dout("con_work CLOSED\n"); - con_close_socket(con); + if (con->state == CON_STATE_CLOSED) { + dout("con_work %p CLOSED\n", con); + BUG_ON(con->sock); goto done; } - if (test_and_clear_bit(OPENING, &con->state)) { - /* reopen w/ new peer */ + if (con->state == CON_STATE_PREOPEN) { dout("con_work OPENING\n"); - con_close_socket(con); + BUG_ON(con->sock); } - if (test_and_clear_bit(SOCK_CLOSED, &con->state)) - goto fault; - ret = try_read(con); if (ret == -EAGAIN) goto restart; - if (ret < 0) + if (ret < 0) { + con->error_msg = "socket error on read"; goto fault; + } ret = try_write(con); if (ret == -EAGAIN) goto restart; - if (ret < 0) + if (ret < 0) { + con->error_msg = "socket error on write"; goto fault; + } done: mutex_unlock(&con->mutex); @@ -2172,7 +2339,6 @@ done_unlocked: return; fault: - mutex_unlock(&con->mutex); ceph_fault(con); /* error/fault path */ goto done_unlocked; } @@ -2183,26 +2349,31 @@ fault: * exponential backoff */ static void ceph_fault(struct ceph_connection *con) + __releases(con->mutex) { pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); - if (test_bit(LOSSYTX, &con->state)) { - dout("fault on LOSSYTX channel\n"); - goto out; - } - - mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state)) - goto out_unlock; + BUG_ON(con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != CON_STATE_OPEN); con_close_socket(con); + if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { + dout("fault on LOSSYTX channel, marking CLOSED\n"); + con->state = CON_STATE_CLOSED; + goto out_unlock; + } + if (con->in_msg) { + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; + con->ops->put(con); } /* Requeue anything that hasn't been acked */ @@ -2211,12 +2382,13 @@ static void ceph_fault(struct ceph_connection *con) /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !test_bit(KEEPALIVE_PENDING, &con->state)) { + !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - clear_bit(WRITE_PENDING, &con->state); - set_bit(STANDBY, &con->state); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con->state = CON_STATE_STANDBY; } else { /* retry after a delay. */ + con->state = CON_STATE_PREOPEN; if (con->delay == 0) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) @@ -2237,13 +2409,12 @@ static void ceph_fault(struct ceph_connection *con) * that when con_work restarts we schedule the * delay then. */ - set_bit(BACKOFF, &con->state); + set_bit(CON_FLAG_BACKOFF, &con->flags); } } out_unlock: mutex_unlock(&con->mutex); -out: /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. @@ -2260,18 +2431,14 @@ out: /* - * create a new messenger instance + * initialize a new messenger instance */ -struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features) +void ceph_messenger_init(struct ceph_messenger *msgr, + struct ceph_entity_addr *myaddr, + u32 supported_features, + u32 required_features, + bool nocrc) { - struct ceph_messenger *msgr; - - msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); - if (msgr == NULL) - return ERR_PTR(-ENOMEM); - msgr->supported_features = supported_features; msgr->required_features = required_features; @@ -2284,30 +2451,23 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, msgr->inst.addr.type = 0; get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); + msgr->nocrc = nocrc; - dout("messenger_create %p\n", msgr); - return msgr; -} -EXPORT_SYMBOL(ceph_messenger_create); + atomic_set(&msgr->stopping, 0); -void ceph_messenger_destroy(struct ceph_messenger *msgr) -{ - dout("destroy %p\n", msgr); - kfree(msgr); - dout("destroyed messenger %p\n", msgr); + dout("%s %p\n", __func__, msgr); } -EXPORT_SYMBOL(ceph_messenger_destroy); +EXPORT_SYMBOL(ceph_messenger_init); static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ - if (test_and_clear_bit(STANDBY, &con->state)) { - mutex_lock(&con->mutex); + if (con->state == CON_STATE_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); + con->state = CON_STATE_PREOPEN; con->connect_seq++; - WARN_ON(test_bit(WRITE_PENDING, &con->state)); - WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); - mutex_unlock(&con->mutex); + WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); + WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); } } @@ -2316,21 +2476,24 @@ static void clear_standby(struct ceph_connection *con) */ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { - if (test_bit(CLOSED, &con->state)) { - dout("con_send %p closed, dropping %p\n", con, msg); - ceph_msg_put(msg); - return; - } - /* set src+dst */ msg->hdr.src = con->msgr->inst.name; - BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); - msg->needs_out_seq = true; - /* queue */ mutex_lock(&con->mutex); + + if (con->state == CON_STATE_CLOSED) { + dout("con_send %p closed, dropping %p\n", con, msg); + ceph_msg_put(msg); + mutex_unlock(&con->mutex); + return; + } + + BUG_ON(msg->con != NULL); + msg->con = con->ops->get(con); + BUG_ON(msg->con == NULL); + BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, @@ -2339,12 +2502,13 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len)); + + clear_standby(con); mutex_unlock(&con->mutex); /* if there wasn't anything waiting to send before, queue * new work */ - clear_standby(con); - if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) + if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -2352,24 +2516,34 @@ EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send */ -void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) +void ceph_msg_revoke(struct ceph_msg *msg) { + struct ceph_connection *con = msg->con; + + if (!con) + return; /* Message not in our possession */ + mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p - was on queue\n", con, msg); + dout("%s %p msg %p - was on queue\n", __func__, con, msg); list_del_init(&msg->list_head); - ceph_msg_put(msg); + BUG_ON(msg->con == NULL); + msg->con->ops->put(msg->con); + msg->con = NULL; msg->hdr.seq = 0; + + ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("con_revoke %p msg %p - was sending\n", con, msg); + dout("%s %p msg %p - was sending\n", __func__, con, msg); con->out_msg = NULL; if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; } - ceph_msg_put(msg); msg->hdr.seq = 0; + + ceph_msg_put(msg); } mutex_unlock(&con->mutex); } @@ -2377,17 +2551,27 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) /* * Revoke a message that we may be reading data into */ -void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) +void ceph_msg_revoke_incoming(struct ceph_msg *msg) { + struct ceph_connection *con; + + BUG_ON(msg == NULL); + if (!msg->con) { + dout("%s msg %p null con\n", __func__, msg); + + return; /* Message not in our possession */ + } + + con = msg->con; mutex_lock(&con->mutex); - if (con->in_msg && con->in_msg == msg) { + if (con->in_msg == msg) { unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); /* skip rest of message */ - dout("con_revoke_pages %p msg %p revoked\n", con, msg); - con->in_base_pos = con->in_base_pos - + dout("%s %p msg %p revoked\n", __func__, con, msg); + con->in_base_pos = con->in_base_pos - sizeof(struct ceph_msg_header) - front_len - middle_len - @@ -2398,8 +2582,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; } else { - dout("con_revoke_pages %p msg %p pages %p no-op\n", - con, con->in_msg, msg); + dout("%s %p in_msg %p msg %p no-op\n", + __func__, con, con->in_msg, msg); } mutex_unlock(&con->mutex); } @@ -2410,9 +2594,11 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) void ceph_con_keepalive(struct ceph_connection *con) { dout("con_keepalive %p\n", con); + mutex_lock(&con->mutex); clear_standby(con); - if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && - test_and_set_bit(WRITE_PENDING, &con->state) == 0) + mutex_unlock(&con->mutex); + if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && + test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -2431,6 +2617,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, if (m == NULL) goto out; kref_init(&m->kref); + + m->con = NULL; INIT_LIST_HEAD(&m->list_head); m->hdr.tid = 0; @@ -2526,46 +2714,77 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) } /* - * Generic message allocator, for incoming messages. + * Allocate a message for receiving an incoming message on a + * connection, and save the result in con->in_msg. Uses the + * connection's private alloc_msg op if available. + * + * Returns 0 on success, or a negative error code. + * + * On success, if we set *skip = 1: + * - the next message should be skipped and ignored. + * - con->in_msg == NULL + * or if we set *skip = 0: + * - con->in_msg is non-null. + * On error (ENOMEM, EAGAIN, ...), + * - con->in_msg == NULL */ -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) { + struct ceph_msg_header *hdr = &con->in_hdr; int type = le16_to_cpu(hdr->type); int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); - struct ceph_msg *msg = NULL; - int ret; + int ret = 0; + + BUG_ON(con->in_msg != NULL); if (con->ops->alloc_msg) { + struct ceph_msg *msg; + mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (!msg || *skip) - return NULL; + if (con->state != CON_STATE_OPEN) { + ceph_msg_put(msg); + return -EAGAIN; + } + con->in_msg = msg; + if (con->in_msg) { + con->in_msg->con = con->ops->get(con); + BUG_ON(con->in_msg->con == NULL); + } + if (*skip) { + con->in_msg = NULL; + return 0; + } + if (!con->in_msg) { + con->error_msg = + "error allocating memory for incoming message"; + return -ENOMEM; + } } - if (!msg) { - *skip = 0; - msg = ceph_msg_new(type, front_len, GFP_NOFS, false); - if (!msg) { + if (!con->in_msg) { + con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!con->in_msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); - return NULL; + return -ENOMEM; } - msg->page_alignment = le16_to_cpu(hdr->data_off); + con->in_msg->con = con->ops->get(con); + BUG_ON(con->in_msg->con == NULL); + con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); } - memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); + memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); - if (middle_len && !msg->middle) { - ret = ceph_alloc_middle(con, msg); + if (middle_len && !con->in_msg->middle) { + ret = ceph_alloc_middle(con, con->in_msg); if (ret < 0) { - ceph_msg_put(msg); - return NULL; + ceph_msg_put(con->in_msg); + con->in_msg = NULL; } } - return msg; + return ret; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index d0649a9655be..105d533b55f3 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) monc->pending_auth = 1; monc->m_auth->front.iov_len = len; monc->m_auth->hdr.front_len = cpu_to_le32(len); - ceph_con_revoke(monc->con, monc->m_auth); + ceph_msg_revoke(monc->m_auth); ceph_msg_get(monc->m_auth); /* keep our ref */ - ceph_con_send(monc->con, monc->m_auth); + ceph_con_send(&monc->con, monc->m_auth); } /* @@ -117,8 +117,11 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) static void __close_session(struct ceph_mon_client *monc) { dout("__close_session closing mon%d\n", monc->cur_mon); - ceph_con_revoke(monc->con, monc->m_auth); - ceph_con_close(monc->con); + ceph_msg_revoke(monc->m_auth); + ceph_msg_revoke_incoming(monc->m_auth_reply); + ceph_msg_revoke(monc->m_subscribe); + ceph_msg_revoke_incoming(monc->m_subscribe_ack); + ceph_con_close(&monc->con); monc->cur_mon = -1; monc->pending_auth = 0; ceph_auth_reset(monc->auth); @@ -142,9 +145,8 @@ static int __open_session(struct ceph_mon_client *monc) monc->want_next_osdmap = !!monc->want_next_osdmap; dout("open_session mon%d opening\n", monc->cur_mon); - monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; - monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); - ceph_con_open(monc->con, + ceph_con_open(&monc->con, + CEPH_ENTITY_TYPE_MON, monc->cur_mon, &monc->monmap->mon_inst[monc->cur_mon].addr); /* initiatiate authentication handshake */ @@ -226,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_con_revoke(monc->con, msg); - ceph_con_send(monc->con, ceph_msg_get(msg)); + ceph_msg_revoke(msg); + ceph_con_send(&monc->con, ceph_msg_get(msg)); monc->sub_sent = jiffies | 1; /* never 0 */ } @@ -247,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, if (monc->hunting) { pr_info("mon%d %s session established\n", monc->cur_mon, - ceph_pr_addr(&monc->con->peer_addr.in_addr)); + ceph_pr_addr(&monc->con.peer_addr.in_addr)); monc->hunting = false; } dout("handle_subscribe_ack after %d seconds\n", seconds); @@ -439,6 +441,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, m = NULL; } else { dout("get_generic_reply %lld got %p\n", tid, req->reply); + *skip = 0; m = ceph_msg_get(req->reply); /* * we don't need to track the connection reading into @@ -461,7 +464,7 @@ static int do_generic_request(struct ceph_mon_client *monc, req->request->hdr.tid = cpu_to_le64(req->tid); __insert_generic_request(monc, req); monc->num_generic_requests++; - ceph_con_send(monc->con, ceph_msg_get(req->request)); + ceph_con_send(&monc->con, ceph_msg_get(req->request)); mutex_unlock(&monc->mutex); err = wait_for_completion_interruptible(&req->completion); @@ -684,8 +687,9 @@ static void __resend_generic_request(struct ceph_mon_client *monc) for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { req = rb_entry(p, struct ceph_mon_generic_request, node); - ceph_con_revoke(monc->con, req->request); - ceph_con_send(monc->con, ceph_msg_get(req->request)); + ceph_msg_revoke(req->request); + ceph_msg_revoke_incoming(req->reply); + ceph_con_send(&monc->con, ceph_msg_get(req->request)); } } @@ -705,7 +709,7 @@ static void delayed_work(struct work_struct *work) __close_session(monc); __open_session(monc); /* continue hunting */ } else { - ceph_con_keepalive(monc->con); + ceph_con_keepalive(&monc->con); __validate_auth(monc); @@ -760,19 +764,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) goto out; /* connection */ - monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); - if (!monc->con) - goto out_monmap; - ceph_con_init(monc->client->msgr, monc->con); - monc->con->private = monc; - monc->con->ops = &mon_con_ops; - /* authentication */ monc->auth = ceph_auth_init(cl->options->name, cl->options->key); if (IS_ERR(monc->auth)) { err = PTR_ERR(monc->auth); - goto out_con; + goto out_monmap; } monc->auth->want_keys = CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | @@ -801,6 +798,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) if (!monc->m_auth) goto out_auth_reply; + ceph_con_init(&monc->con, monc, &mon_con_ops, + &monc->client->msgr); + monc->cur_mon = -1; monc->hunting = true; monc->sub_renew_after = jiffies; @@ -824,8 +824,6 @@ out_subscribe_ack: ceph_msg_put(monc->m_subscribe_ack); out_auth: ceph_auth_destroy(monc->auth); -out_con: - monc->con->ops->put(monc->con); out_monmap: kfree(monc->monmap); out: @@ -841,10 +839,6 @@ void ceph_monc_stop(struct ceph_mon_client *monc) mutex_lock(&monc->mutex); __close_session(monc); - monc->con->private = NULL; - monc->con->ops->put(monc->con); - monc->con = NULL; - mutex_unlock(&monc->mutex); /* @@ -888,8 +882,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc, } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); - monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; - monc->client->msgr->inst.name.num = + monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; + monc->client->msgr.inst.name.num = cpu_to_le64(monc->auth->global_id); __send_subscribe(monc); @@ -1000,6 +994,8 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: m = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!m) + return NULL; /* ENOMEM--return skip == 0 */ break; } @@ -1029,7 +1025,7 @@ static void mon_fault(struct ceph_connection *con) if (!monc->hunting) pr_info("mon%d %s session lost, " "hunting for new mon\n", monc->cur_mon, - ceph_pr_addr(&monc->con->peer_addr.in_addr)); + ceph_pr_addr(&monc->con.peer_addr.in_addr)); __close_session(monc); if (!monc->hunting) { @@ -1044,9 +1040,23 @@ out: mutex_unlock(&monc->mutex); } +/* + * We can ignore refcounting on the connection struct, as all references + * will come from the messenger workqueue, which is drained prior to + * mon_client destruction. + */ +static struct ceph_connection *con_get(struct ceph_connection *con) +{ + return con; +} + +static void con_put(struct ceph_connection *con) +{ +} + static const struct ceph_connection_operations mon_con_ops = { - .get = ceph_con_get, - .put = ceph_con_put, + .get = con_get, + .put = con_put, .dispatch = dispatch, .fault = mon_fault, .alloc_msg = mon_alloc_msg, diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index 11d5f4196a73..ddec1c10ac80 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) struct ceph_msgpool *pool = arg; struct ceph_msg *msg; - msg = ceph_msg_new(0, pool->front_len, gfp_mask, true); + msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); if (!msg) { dout("msgpool_alloc %s failed\n", pool->name); } else { @@ -32,10 +32,11 @@ static void msgpool_free(void *element, void *arg) ceph_msg_put(msg); } -int ceph_msgpool_init(struct ceph_msgpool *pool, +int ceph_msgpool_init(struct ceph_msgpool *pool, int type, int front_len, int size, bool blocking, const char *name) { dout("msgpool %s init\n", name); + pool->type = type; pool->front_len = front_len; pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); if (!pool->pool) @@ -61,7 +62,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, WARN_ON(1); /* try to alloc a fresh message */ - return ceph_msg_new(0, front_len, GFP_NOFS, false); + return ceph_msg_new(pool->type, front_len, GFP_NOFS, false); } msg = mempool_alloc(pool->pool, GFP_NOFS); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ca59e66c9787..42119c05e82c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -140,10 +140,9 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_request) ceph_msg_put(req->r_request); if (req->r_con_filling_msg) { - dout("release_request revoking pages %p from con %p\n", + dout("%s revoking pages %p from con %p\n", __func__, req->r_pages, req->r_con_filling_msg); - ceph_con_revoke_message(req->r_con_filling_msg, - req->r_reply); + ceph_msg_revoke_incoming(req->r_reply); req->r_con_filling_msg->ops->put(req->r_con_filling_msg); } if (req->r_reply) @@ -214,10 +213,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); + rb_init_node(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd); INIT_LIST_HEAD(&req->r_req_lru_item); + INIT_LIST_HEAD(&req->r_osd_item); + req->r_flags = flags; WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); @@ -243,6 +245,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } ceph_pagelist_init(req->r_trail); } + /* create request message; allow space for oid */ msg_size += MAX_OBJ_NAME_SIZE; if (snapc) @@ -256,7 +259,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, return NULL; } - msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); memset(msg->front.iov_base, 0, msg->front.iov_len); req->r_request = msg; @@ -624,7 +626,7 @@ static void osd_reset(struct ceph_connection *con) /* * Track open sessions with osds. */ -static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) +static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) { struct ceph_osd *osd; @@ -634,15 +636,13 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) atomic_set(&osd->o_ref, 1); osd->o_osdc = osdc; + osd->o_osd = onum; INIT_LIST_HEAD(&osd->o_requests); INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_osd_lru); osd->o_incarnation = 1; - ceph_con_init(osdc->client->msgr, &osd->o_con); - osd->o_con.private = osd; - osd->o_con.ops = &osd_con_ops; - osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; + ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); INIT_LIST_HEAD(&osd->o_keepalive_item); return osd; @@ -688,7 +688,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) static void remove_all_osds(struct ceph_osd_client *osdc) { - dout("__remove_old_osds %p\n", osdc); + dout("%s %p\n", __func__, osdc); mutex_lock(&osdc->request_mutex); while (!RB_EMPTY_ROOT(&osdc->osds)) { struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), @@ -752,7 +752,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) ret = -EAGAIN; } else { ceph_con_close(&osd->o_con); - ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); + ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, + &osdc->osdmap->osd_addr[osd->o_osd]); osd->o_incarnation++; } return ret; @@ -853,7 +854,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, if (req->r_osd) { /* make sure the original request isn't in flight. */ - ceph_con_revoke(&req->r_osd->o_con, req->r_request); + ceph_msg_revoke(req->r_request); list_del_init(&req->r_osd_item); if (list_empty(&req->r_osd->o_requests) && @@ -880,7 +881,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, static void __cancel_request(struct ceph_osd_request *req) { if (req->r_sent && req->r_osd) { - ceph_con_revoke(&req->r_osd->o_con, req->r_request); + ceph_msg_revoke(req->r_request); req->r_sent = 0; } } @@ -890,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc, { dout("__register_linger_request %p\n", req); list_add_tail(&req->r_linger_item, &osdc->req_linger); - list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); + if (req->r_osd) + list_add_tail(&req->r_linger_osd, + &req->r_osd->o_linger_requests); } static void __unregister_linger_request(struct ceph_osd_client *osdc, @@ -998,18 +1001,18 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = __lookup_osd(osdc, o); if (!req->r_osd && o >= 0) { err = -ENOMEM; - req->r_osd = create_osd(osdc); + req->r_osd = create_osd(osdc, o); if (!req->r_osd) { list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; } dout("map_request osd %p is osd%d\n", req->r_osd, o); - req->r_osd->o_osd = o; - req->r_osd->o_con.peer_name.num = cpu_to_le64(o); __insert_osd(osdc, req->r_osd); - ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); + ceph_con_open(&req->r_osd->o_con, + CEPH_ENTITY_TYPE_OSD, o, + &osdc->osdmap->osd_addr[o]); } if (req->r_osd) { @@ -1304,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + for (p = rb_first(&osdc->requests); p; ) { req = rb_entry(p, struct ceph_osd_request, r_node); + p = rb_next(p); err = __map_request(osdc, req, force_resend); if (err < 0) continue; /* error */ @@ -1313,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("%p tid %llu maps to no osd\n", req, req->r_tid); needmap++; /* request a newer map */ } else if (err > 0) { - dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - if (!req->r_linger) + if (!req->r_linger) { + dout("%p tid %llu requeued on osd%d\n", req, + req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + } + if (req->r_linger && list_empty(&req->r_linger_item)) { + /* + * register as a linger so that we will + * re-submit below and get a new tid + */ + dout("%p tid %llu restart on osd%d\n", + req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + __register_linger_request(osdc, req); + __unregister_request(osdc, req); } } @@ -1391,7 +1408,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) epoch, maplen); newmap = osdmap_apply_incremental(&p, next, osdc->osdmap, - osdc->client->msgr); + &osdc->client->msgr); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; @@ -1839,11 +1856,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->req_mempool) goto out; - err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, + err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, + OSD_OP_FRONT_LEN, 10, true, "osd_op"); if (err < 0) goto out_mempool; - err = ceph_msgpool_init(&osdc->msgpool_op_reply, + err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, OSD_OPREPLY_FRONT_LEN, 10, true, "osd_op_reply"); if (err < 0) @@ -2019,15 +2037,15 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, if (!req) { *skip = 1; m = NULL; - pr_info("get_reply unknown tid %llu from osd%d\n", tid, - osd->o_osd); + dout("get_reply unknown tid %llu from osd%d\n", tid, + osd->o_osd); goto out; } if (req->r_con_filling_msg) { - dout("get_reply revoking msg %p from old con %p\n", + dout("%s revoking msg %p from old con %p\n", __func__, req->r_reply, req->r_con_filling_msg); - ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); + ceph_msg_revoke_incoming(req->r_reply); req->r_con_filling_msg->ops->put(req->r_con_filling_msg); req->r_con_filling_msg = NULL; } @@ -2080,6 +2098,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, int type = le16_to_cpu(hdr->type); int front = le32_to_cpu(hdr->front_len); + *skip = 0; switch (type) { case CEPH_MSG_OSD_MAP: case CEPH_MSG_WATCH_NOTIFY: diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 81e3b84a77ef..3124b71a8883 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -135,6 +135,21 @@ bad: return -EINVAL; } +static int skip_name_map(void **p, void *end) +{ + int len; + ceph_decode_32_safe(p, end, len ,bad); + while (len--) { + int strlen; + *p += sizeof(u32); + ceph_decode_32_safe(p, end, strlen, bad); + *p += strlen; +} + return 0; +bad: + return -EINVAL; +} + static struct crush_map *crush_decode(void *pbyval, void *end) { struct crush_map *c; @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) void **p = &pbyval; void *start = pbyval; u32 magic; + u32 num_name_maps; dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (c == NULL) return ERR_PTR(-ENOMEM); + /* set tunables to default values */ + c->choose_local_tries = 2; + c->choose_local_fallback_tries = 5; + c->choose_total_tries = 19; + ceph_decode_need(p, end, 4*sizeof(u32), bad); magic = ceph_decode_32(p); if (magic != CRUSH_MAGIC) { @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end) } /* ignore trailing name maps. */ + for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { + err = skip_name_map(p, end); + if (err < 0) + goto done; + } + + /* tunables */ + ceph_decode_need(p, end, 3*sizeof(u32), done); + c->choose_local_tries = ceph_decode_32(p); + c->choose_local_fallback_tries = ceph_decode_32(p); + c->choose_total_tries = ceph_decode_32(p); + dout("crush decode tunable choose_local_tries = %d", + c->choose_local_tries); + dout("crush decode tunable choose_local_fallback_tries = %d", + c->choose_local_fallback_tries); + dout("crush decode tunable choose_total_tries = %d", + c->choose_total_tries); +done: dout("crush_decode success\n"); return c; @@ -488,15 +527,16 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) ceph_decode_32_safe(p, end, pool, bad); ceph_decode_32_safe(p, end, len, bad); dout(" pool %d len %d\n", pool, len); + ceph_decode_need(p, end, len, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) { + char *name = kstrndup(*p, len, GFP_NOFS); + + if (!name) + return -ENOMEM; kfree(pi->name); - pi->name = kmalloc(len + 1, GFP_NOFS); - if (pi->name) { - memcpy(pi->name, *p, len); - pi->name[len] = '\0'; - dout(" name is %s\n", pi->name); - } + pi->name = name; + dout(" name is %s\n", pi->name); } *p += len; } @@ -666,6 +706,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); ceph_decode_copy(p, &pgid, sizeof(pgid)); n = ceph_decode_32(p); + err = -EINVAL; + if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + goto bad; ceph_decode_need(p, end, n * sizeof(u32), bad); err = -ENOMEM; pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); @@ -889,6 +932,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, (void) __remove_pg_mapping(&map->pg_temp, pgid); /* insert */ + if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) { + err = -EINVAL; + goto bad; + } pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); if (!pg) { err = -ENOMEM; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 47ad2666fdf6..2afd2a84dc35 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1349,8 +1349,11 @@ static int c_show(struct seq_file *m, void *p) if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ seq_printf(m, "# "); - else + else { + if (cache_is_expired(cd, cp)) + seq_printf(m, "# "); cache_put(cp, cd); + } return cd->cache_show(m, cd, cp); } diff --git a/scripts/sortextable.c b/scripts/sortextable.c index 1ca9ceb95eb6..6acf83449105 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -247,6 +247,7 @@ do_file(char const *const fname) case EM_X86_64: custom_sort = sort_x86_table; break; + case EM_S390: case EM_MIPS: break; } /* end switch */ |