diff -urNp linux-2.6.32.48/arch/x86/ia32/ia32entry.S linux-2.6.32.48-openvz/arch/x86/ia32/ia32entry.S --- linux-2.6.32.48/arch/x86/ia32/ia32entry.S 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/ia32/ia32entry.S 2011-11-21 17:40:44.000000000 -0500 @@ -623,7 +623,7 @@ ia32_sys_call_table: .quad stub32_iopl /* 110 */ .quad sys_vhangup .quad quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ + .quad quiet_ni_syscall /* vm86old */ .quad compat_sys_wait4 .quad sys_swapoff /* 115 */ .quad compat_sys_sysinfo @@ -676,7 +676,7 @@ ia32_sys_call_table: .quad sys_mremap .quad sys_setresuid16 .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ + .quad quiet_ni_syscall /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll .quad compat_sys_nfsservctl @@ -847,4 +847,25 @@ ia32_sys_call_table: .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open + .rept 500-(.-ia32_sys_call_table)/8 + .quad sys_ni_syscall + .endr + .quad sys_fairsched_mknod /* 500 */ + .quad sys_fairsched_rmnod + .quad sys_fairsched_chwt + .quad sys_fairsched_mvpr + .quad sys_fairsched_rate + .quad sys_fairsched_vcpus /* 505 */ + .quad sys_ni_syscall + .quad sys_ni_syscall + .quad sys_ni_syscall + .quad sys_ni_syscall + .quad sys_getluid /* 510 */ + .quad sys_setluid + .quad compat_sys_setublimit + .quad compat_sys_ubstat + .quad sys_ni_syscall + .quad sys_ni_syscall /* 515 */ + .quad sys_lchmod + .quad compat_sys_lutime ia32_syscall_end: diff -urNp linux-2.6.32.48/arch/x86/ia32/sys_ia32.c linux-2.6.32.48-openvz/arch/x86/ia32/sys_ia32.c --- linux-2.6.32.48/arch/x86/ia32/sys_ia32.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/ia32/sys_ia32.c 2011-11-21 17:40:44.000000000 -0500 @@ -623,20 +623,6 @@ long sys32_fadvise64_64(int fd, __u32 of advice); } -long sys32_vm86_warning(void) -{ - struct task_struct *me = current; - static char 
lastcomm[sizeof(me->comm)]; - - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO - "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, char __user *buf, size_t len) { diff -urNp linux-2.6.32.48/arch/x86/include/asm/elf.h linux-2.6.32.48-openvz/arch/x86/include/asm/elf.h --- linux-2.6.32.48/arch/x86/include/asm/elf.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/include/asm/elf.h 2011-11-21 17:40:44.000000000 -0500 @@ -285,7 +285,7 @@ struct task_struct; #define ARCH_DLINFO_IA32(vdso_enabled) \ do { \ - if (vdso_enabled) { \ + if (vdso_enabled && sysctl_at_vsyscall) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ @@ -332,9 +332,11 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); + int uses_interp, + unsigned long map_address); -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); +extern int syscall32_setup_pages(struct linux_binprm *, int exstack, + unsigned long map_address); #define compat_arch_setup_additional_pages syscall32_setup_pages extern unsigned long arch_randomize_brk(struct mm_struct *mm); diff -urNp linux-2.6.32.48/arch/x86/include/asm/pgalloc.h linux-2.6.32.48-openvz/arch/x86/include/asm/pgalloc.h --- linux-2.6.32.48/arch/x86/include/asm/pgalloc.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/include/asm/pgalloc.h 2011-11-21 17:40:44.000000000 -0500 @@ -80,7 +80,7 @@ static inline void pmd_populate(struct m #if PAGETABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT); } static inline void 
pmd_free(struct mm_struct *mm, pmd_t *pmd) @@ -116,7 +116,7 @@ static inline void pgd_populate(struct m static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) diff -urNp linux-2.6.32.48/arch/x86/include/asm/processor.h linux-2.6.32.48-openvz/arch/x86/include/asm/processor.h --- linux-2.6.32.48/arch/x86/include/asm/processor.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/include/asm/processor.h 2011-11-21 17:40:44.000000000 -0500 @@ -974,8 +974,7 @@ extern unsigned long thread_saved_pc(str /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ - 0xc0000000 : 0xFFFFe000) +#define IA32_PAGE_OFFSET 0xc0000000 #define TASK_SIZE (test_thread_flag(TIF_IA32) ? 
\ IA32_PAGE_OFFSET : TASK_SIZE_MAX) diff -urNp linux-2.6.32.48/arch/x86/include/asm/thread_info.h linux-2.6.32.48-openvz/arch/x86/include/asm/thread_info.h --- linux-2.6.32.48/arch/x86/include/asm/thread_info.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/include/asm/thread_info.h 2011-11-21 17:40:44.000000000 -0500 @@ -95,6 +95,7 @@ struct thread_info { #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#define TIF_RESUME 29 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -117,6 +118,7 @@ struct thread_info { #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_RESUME (1< #include #include +#include #include #include #include "cpu.h" diff -urNp linux-2.6.32.48/arch/x86/kernel/dumpstack_32.c linux-2.6.32.48-openvz/arch/x86/kernel/dumpstack_32.c --- linux-2.6.32.48/arch/x86/kernel/dumpstack_32.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/dumpstack_32.c 2011-11-21 17:40:44.000000000 -0500 @@ -105,8 +105,9 @@ void show_registers(struct pt_regs *regs print_modules(); __show_regs(regs, 0); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", + printk(KERN_EMERG "Process %.*s (pid: %d, veid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), + VEID(current->ve_task_info.owner_env), current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the diff -urNp linux-2.6.32.48/arch/x86/kernel/dumpstack_64.c linux-2.6.32.48-openvz/arch/x86/kernel/dumpstack_64.c --- linux-2.6.32.48/arch/x86/kernel/dumpstack_64.c 2011-11-08 19:02:43.000000000 -0500 +++ 
linux-2.6.32.48-openvz/arch/x86/kernel/dumpstack_64.c 2011-11-21 17:40:44.000000000 -0500 @@ -254,8 +254,10 @@ void show_registers(struct pt_regs *regs sp = regs->sp; printk("CPU %d ", cpu); __show_regs(regs, 1); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); + printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, + VEID(VE_TASK_INFO(current)->owner_env), + task_thread_info(cur), cur); /* * When in-kernel, we also print out the stack and code at the diff -urNp linux-2.6.32.48/arch/x86/kernel/dumpstack.c linux-2.6.32.48-openvz/arch/x86/kernel/dumpstack.c --- linux-2.6.32.48/arch/x86/kernel/dumpstack.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/dumpstack.c 2011-11-21 17:40:44.000000000 -0500 @@ -320,6 +320,7 @@ die_nmi(char *str, struct pt_regs *regs, printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); + nmi_show_regs(regs, 1); oops_end(flags, regs, 0); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); diff -urNp linux-2.6.32.48/arch/x86/kernel/entry_32.S linux-2.6.32.48-openvz/arch/x86/kernel/entry_32.S --- linux-2.6.32.48/arch/x86/kernel/entry_32.S 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/entry_32.S 2011-11-21 17:40:44.000000000 -0500 @@ -325,6 +325,7 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%ebp) popl %eax CFI_ADJUST_CFA_OFFSET -4 +ret_from_fork_tail: pushl $0x0202 # Reset kernel eflags CFI_ADJUST_CFA_OFFSET 4 popfl @@ -333,6 +334,25 @@ ENTRY(ret_from_fork) CFI_ENDPROC END(ret_from_fork) +ENTRY(i386_ret_from_resume) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + movl (%esp),%eax + testl %eax,%eax + jz 1f + pushl %esp + call *%eax + addl $4,%esp +1: + addl $256,%esp + jmp ret_from_fork_tail + CFI_ENDPROC + /* * Return to user mode is not as 
complex as all this looks, * but we want the default path for a system call return to diff -urNp linux-2.6.32.48/arch/x86/kernel/entry_64.S linux-2.6.32.48-openvz/arch/x86/kernel/entry_64.S --- linux-2.6.32.48/arch/x86/kernel/entry_64.S 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/entry_64.S 2011-11-21 17:40:44.000000000 -0500 @@ -405,8 +405,12 @@ ENTRY(ret_from_fork) call schedule_tail # rdi: 'prev' task parameter +ret_from_fork_tail: GET_THREAD_INFO(%rcx) + btr $TIF_RESUME,TI_flags(%rcx) + jc x86_64_ret_from_resume +ret_from_fork_check: RESTORE_REST testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? @@ -418,6 +422,18 @@ ENTRY(ret_from_fork) RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET jmp ret_from_sys_call # go to the SYSRET fastpath +x86_64_ret_from_resume: + movq (%rsp),%rax + testq %rax,%rax + jz 1f + movq %rsp,%rdi + call *%rax +1: + addq $256,%rsp + cmpq $0,ORIG_RAX(%rsp) + jge ret_from_fork_tail + RESTORE_REST + jmp int_ret_from_sys_call CFI_ENDPROC END(ret_from_fork) @@ -1182,7 +1198,7 @@ ENTRY(kernel_thread) xorl %r9d,%r9d # clone now - call do_fork + call do_fork_kthread movq %rax,RAX(%rsp) xorl %edi,%edi diff -urNp linux-2.6.32.48/arch/x86/kernel/i387.c linux-2.6.32.48-openvz/arch/x86/kernel/i387.c --- linux-2.6.32.48/arch/x86/kernel/i387.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/i387.c 2011-11-21 17:40:44.000000000 -0500 @@ -163,6 +163,7 @@ int init_fpu(struct task_struct *tsk) set_stopped_child_used_math(tsk); return 0; } +EXPORT_SYMBOL(init_fpu); int fpregs_active(struct task_struct *target, const struct user_regset *regset) { diff -urNp linux-2.6.32.48/arch/x86/kernel/ldt.c linux-2.6.32.48-openvz/arch/x86/kernel/ldt.c --- linux-2.6.32.48/arch/x86/kernel/ldt.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/ldt.c 2011-11-21 17:40:44.000000000 -0500 @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -39,9 +41,9 @@ static 
int alloc_ldt(mm_context_t *pc, i mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + newldt = ub_vmalloc(mincount * LDT_ENTRY_SIZE); else - newldt = (void *)__get_free_page(GFP_KERNEL); + newldt = (void *)__get_free_page(GFP_KERNEL_UBC); if (!newldt) return -ENOMEM; @@ -117,6 +119,7 @@ int init_new_context(struct task_struct } return retval; } +EXPORT_SYMBOL_GPL(init_new_context); /* * No need to lock the MM as we are the last user diff -urNp linux-2.6.32.48/arch/x86/kernel/process_32.c linux-2.6.32.48-openvz/arch/x86/kernel/process_32.c --- linux-2.6.32.48/arch/x86/kernel/process_32.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/process_32.c 2011-11-21 17:40:44.000000000 -0500 @@ -40,6 +40,8 @@ #include #include #include +#include +#include #include #include @@ -60,6 +62,9 @@ #include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +EXPORT_SYMBOL(ret_from_fork); +asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume"); +EXPORT_SYMBOL_GPL(i386_ret_from_resume); /* * Return saved PC of a blocked thread. 
@@ -144,16 +149,17 @@ void __show_regs(struct pt_regs *regs, i board = dmi_get_system_info(DMI_PRODUCT_NAME); if (!board) board = ""; - printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", + printk("Pid: %d, comm: %s %s (%s %.*s) %s %s)\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + init_utsname()->version, VZVERSION, board); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->ip); + if (decode_call_traces) + print_symbol("EIP is at %s\n", regs->ip); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); @@ -189,6 +195,8 @@ void show_regs(struct pt_regs *regs) { __show_regs(regs, 1); show_trace(NULL, regs, &regs->sp, regs->bp); + if (!decode_call_traces) + printk(" EIP: [<%08lx>]\n", regs->ip); } /* @@ -197,6 +205,7 @@ void show_regs(struct pt_regs *regs) * the "args". 
*/ extern void kernel_thread_helper(void); +EXPORT_SYMBOL(kernel_thread_helper); /* * Create a kernel thread @@ -205,6 +214,13 @@ int kernel_thread(int (*fn)(void *), voi { struct pt_regs regs; + /* Don't allow kernel_thread() inside VE */ + if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; + } + memset(&regs, 0, sizeof(regs)); regs.bx = (unsigned long) fn; diff -urNp linux-2.6.32.48/arch/x86/kernel/process_64.c linux-2.6.32.48-openvz/arch/x86/kernel/process_64.c --- linux-2.6.32.48/arch/x86/kernel/process_64.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/process_64.c 2011-11-21 17:40:44.000000000 -0500 @@ -25,8 +25,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -53,8 +55,6 @@ #include #include -asmlinkage extern void ret_from_fork(void); - DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); @@ -169,13 +169,14 @@ void __show_regs(struct pt_regs *regs, i board = dmi_get_system_info(DMI_PRODUCT_NAME); if (!board) board = ""; - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s %s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + init_utsname()->version, VZVERSION, board); printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); - printk_address(regs->ip, 1); + if (decode_call_traces) + printk_address(regs->ip, 1); printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", @@ -228,7 +229,9 @@ void show_regs(struct pt_regs *regs) { printk(KERN_INFO "CPU %d:", smp_processor_id()); __show_regs(regs, 1); - show_trace(NULL, regs, (void *)(regs + 1), regs->bp); + show_trace(NULL, regs, &regs->sp, 
regs->bp); + if (!decode_call_traces) + printk(" EIP: [<%08lx>]\n", regs->ip); } void release_thread(struct task_struct *dead_task) @@ -679,3 +682,20 @@ unsigned long KSTK_ESP(struct task_struc return (test_tsk_thread_flag(task, TIF_IA32)) ? (task_pt_regs(task)->sp) : ((task)->thread.usersp); } + +long do_fork_kthread(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + if (ve_allow_kthreads || ve_is_super(get_exec_env())) + return do_fork(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr); + + /* Don't allow kernel_thread() inside VE */ + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; +} diff -urNp linux-2.6.32.48/arch/x86/kernel/signal.c linux-2.6.32.48-openvz/arch/x86/kernel/signal.c --- linux-2.6.32.48/arch/x86/kernel/signal.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/signal.c 2011-11-21 17:40:44.000000000 -0500 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -792,6 +793,9 @@ static void do_signal(struct pt_regs *re if (!user_mode(regs)) return; + if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + if (current_thread_info()->status & TS_RESTORE_SIGMASK) oldset = &current->saved_sigmask; else @@ -821,6 +825,7 @@ static void do_signal(struct pt_regs *re return; } +no_signal: /* Did we come from a system call? 
*/ if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ diff -urNp linux-2.6.32.48/arch/x86/kernel/smpboot.c linux-2.6.32.48-openvz/arch/x86/kernel/smpboot.c --- linux-2.6.32.48/arch/x86/kernel/smpboot.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/smpboot.c 2011-11-21 17:40:44.000000000 -0500 @@ -758,6 +758,12 @@ do_rest: initial_code = (unsigned long)start_secondary; stack_start.sp = (void *) c_idle.idle->thread.sp; +#ifdef CONFIG_VE + /* Cosmetic: sleep_time won't be changed afterwards for the idle + * thread; keep it 0 rather than -cycles. */ + VE_TASK_INFO(c_idle.idle)->sleep_time = 0; +#endif + /* start_ip had better be page-aligned! */ start_ip = setup_trampoline(); diff -urNp linux-2.6.32.48/arch/x86/kernel/smp.c linux-2.6.32.48-openvz/arch/x86/kernel/smp.c --- linux-2.6.32.48/arch/x86/kernel/smp.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/smp.c 2011-11-21 17:40:44.000000000 -0500 @@ -224,6 +224,11 @@ void smp_call_function_single_interrupt( irq_exit(); } +void send_nmi_ipi_allbutself(void) +{ + apic->send_IPI_allbutself(NMI_VECTOR); +} + struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, diff -urNp linux-2.6.32.48/arch/x86/kernel/syscall_table_32.S linux-2.6.32.48-openvz/arch/x86/kernel/syscall_table_32.S --- linux-2.6.32.48/arch/x86/kernel/syscall_table_32.S 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/syscall_table_32.S 2011-11-21 17:40:44.000000000 -0500 @@ -336,3 +336,24 @@ ENTRY(sys_call_table) .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open + .rept 500-(.-sys_call_table)/4 + .long sys_ni_syscall + .endr + .long sys_fairsched_mknod /* 500 */ + .long sys_fairsched_rmnod + .long sys_fairsched_chwt + .long sys_fairsched_mvpr + .long sys_fairsched_rate + .long sys_fairsched_vcpus /* 505 */ + .long 
sys_ni_syscall + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_getluid /* 510 */ + .long sys_setluid + .long sys_setublimit + .long sys_ubstat + .long sys_ni_syscall + .long sys_ni_syscall /* 515 */ + .long sys_lchmod + .long sys_lutime diff -urNp linux-2.6.32.48/arch/x86/kernel/traps.c linux-2.6.32.48-openvz/arch/x86/kernel/traps.c --- linux-2.6.32.48/arch/x86/kernel/traps.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/traps.c 2011-11-21 17:40:44.000000000 -0500 @@ -405,7 +405,8 @@ static notrace __kprobes void default_do * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog_tick(regs, reason)) + if (nmi_watchdog_tick(regs, reason) + + do_nmi_show_regs(regs, cpu)) return; if (!do_nmi_callback(regs, cpu)) unknown_nmi_error(reason, regs); diff -urNp linux-2.6.32.48/arch/x86/kernel/tsc_sync.c linux-2.6.32.48-openvz/arch/x86/kernel/tsc_sync.c --- linux-2.6.32.48/arch/x86/kernel/tsc_sync.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/tsc_sync.c 2011-11-21 17:40:45.000000000 -0500 @@ -150,6 +150,10 @@ void __cpuinit check_tsc_sync_source(int printk(" passed.\n"); } +#ifdef CONFIG_VE + /* TSC reset. 
kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; +#endif /* * Reset it - just in case we boot another CPU later: */ diff -urNp linux-2.6.32.48/arch/x86/kernel/x8664_ksyms_64.c linux-2.6.32.48-openvz/arch/x86/kernel/x8664_ksyms_64.c --- linux-2.6.32.48/arch/x86/kernel/x8664_ksyms_64.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/kernel/x8664_ksyms_64.c 2011-11-21 17:40:45.000000000 -0500 @@ -3,6 +3,7 @@ #include #include +#include #include @@ -17,6 +18,7 @@ EXPORT_SYMBOL(mcount); #endif +EXPORT_SYMBOL(kernel_execve); EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(__get_user_1); diff -urNp linux-2.6.32.48/arch/x86/mm/fault.c linux-2.6.32.48-openvz/arch/x86/mm/fault.c --- linux-2.6.32.48/arch/x86/mm/fault.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/mm/fault.c 2011-11-21 17:40:45.000000000 -0500 @@ -689,7 +689,7 @@ show_signal_msg(struct pt_regs *regs, un if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); @@ -918,7 +918,7 @@ spurious_fault(unsigned long error_code, return ret; } -int show_unhandled_signals = 1; +int show_unhandled_signals = 0; static inline int access_error(unsigned long error_code, int write, struct vm_area_struct *vma) diff -urNp linux-2.6.32.48/arch/x86/mm/hugetlbpage.c linux-2.6.32.48-openvz/arch/x86/mm/hugetlbpage.c --- linux-2.6.32.48/arch/x86/mm/hugetlbpage.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/mm/hugetlbpage.c 2011-11-21 17:40:45.000000000 -0500 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -230,6 +231,7 @@ int pud_huge(pud_t pud) { return !!(pud_val(pud) & _PAGE_PSE); } +EXPORT_SYMBOL(pmd_huge); struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, diff -urNp linux-2.6.32.48/arch/x86/mm/pgtable.c linux-2.6.32.48-openvz/arch/x86/mm/pgtable.c --- linux-2.6.32.48/arch/x86/mm/pgtable.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/mm/pgtable.c 2011-11-21 17:40:45.000000000 -0500 @@ -4,7 +4,8 @@ #include #include -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO | __GFP_UBC +#define PGALLOC_KERN_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM @@ -16,7 +17,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(PGALLOC_GFP); + return (pte_t *)__get_free_page(PGALLOC_KERN_GFP); } pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) diff -urNp linux-2.6.32.48/arch/x86/mm/tlb.c linux-2.6.32.48-openvz/arch/x86/mm/tlb.c --- linux-2.6.32.48/arch/x86/mm/tlb.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/mm/tlb.c 
2011-11-21 17:40:45.000000000 -0500 @@ -256,6 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_mm); + void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; diff -urNp linux-2.6.32.48/arch/x86/vdso/vdso32-setup.c linux-2.6.32.48-openvz/arch/x86/vdso/vdso32-setup.c --- linux-2.6.32.48/arch/x86/vdso/vdso32-setup.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/vdso/vdso32-setup.c 2011-11-21 17:40:45.000000000 -0500 @@ -17,6 +17,8 @@ #include #include +#include + #include #include #include @@ -37,6 +39,8 @@ enum { #else #define VDSO_DEFAULT VDSO_ENABLED #endif +#undef VDSO_DEFAULT +#define VDSO_DEFAULT VDSO_DISABLED #ifdef CONFIG_X86_64 #define vdso_enabled sysctl_vsyscall32 @@ -193,7 +197,8 @@ static __init void relocate_vdso(Elf32_E } } -static struct page *vdso32_pages[1]; +struct page *vdso32_pages[1]; +EXPORT_SYMBOL_GPL(vdso32_pages); #ifdef CONFIG_X86_64 @@ -309,16 +314,30 @@ int __init sysenter_setup(void) return 0; } +EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN); +EXPORT_SYMBOL_GPL(VDSO32_PRELINK); + /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, + unsigned long map_address) { struct mm_struct *mm = current->mm; - unsigned long addr; + unsigned long addr = map_address; int ret = 0; bool compat; + unsigned long flags; - if (vdso_enabled == VDSO_DISABLED) + if (vdso_enabled == VDSO_DISABLED && map_address == 0) { + current->mm->context.vdso = NULL; return 0; + } + + flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE | + mm->def_flags; + + ret = -ENOMEM; + if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT)) + goto err_charge; down_write(&mm->mmap_sem); @@ -328,19 +347,18 @@ int arch_setup_additional_pages(struct l map_compat_vdso(compat); - if (compat) - addr = 
VDSO_HIGH_BASE; - else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (!compat || map_address) { + addr = get_unmapped_area(NULL, addr, PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - } + } else + addr = VDSO_HIGH_BASE; current->mm->context.vdso = (void *)addr; - if (compat_uses_vma || !compat) { + if (compat_uses_vma || !compat || map_address) { /* * MAYWRITE to allow gdb to COW and set breakpoints * @@ -368,9 +386,13 @@ int arch_setup_additional_pages(struct l current->mm->context.vdso = NULL; up_write(&mm->mmap_sem); + if (ret < 0) + ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL); +err_charge: return ret; } +EXPORT_SYMBOL_GPL(arch_setup_additional_pages); #ifdef CONFIG_X86_64 diff -urNp linux-2.6.32.48/arch/x86/vdso/vma.c linux-2.6.32.48-openvz/arch/x86/vdso/vma.c --- linux-2.6.32.48/arch/x86/vdso/vma.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/arch/x86/vdso/vma.c 2011-11-21 17:40:45.000000000 -0500 @@ -4,6 +4,7 @@ * Subject to the GPL, v.2 */ #include +#include #include #include #include @@ -99,17 +100,23 @@ static unsigned long vdso_addr(unsigned /* Setup a VMA at program startup for the vsyscall page. 
Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, + unsigned long map_address) { struct mm_struct *mm = current->mm; unsigned long addr; int ret; - if (!vdso_enabled) + if (!vdso_enabled && map_address == 0) { + current->mm->context.vdso = NULL; return 0; + } down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, vdso_size); + if (map_address) + addr = map_address; + else + addr = vdso_addr(mm->start_stack, vdso_size); addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; @@ -132,6 +139,7 @@ up_fail: up_write(&mm->mmap_sem); return ret; } +EXPORT_SYMBOL_GPL(arch_setup_additional_pages); static __init int vdso_setup(char *s) { diff -urNp linux-2.6.32.48/block/blk-cgroup.c linux-2.6.32.48-openvz/block/blk-cgroup.c --- linux-2.6.32.48/block/blk-cgroup.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/block/blk-cgroup.c 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,366 @@ +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ +#include +#include +#include +#include +#include +#include "blk-cgroup.h" + +static DEFINE_SPINLOCK(blkio_list_lock); +static LIST_HEAD(blkio_list); + +struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; +EXPORT_SYMBOL_GPL(blkio_root_cgroup); + +bool blkiocg_css_tryget(struct blkio_cgroup *blkcg) +{ + if (!css_tryget(&blkcg->css)) + return false; + return true; +} +EXPORT_SYMBOL_GPL(blkiocg_css_tryget); + +void blkiocg_css_put(struct blkio_cgroup *blkcg) +{ + css_put(&blkcg->css); +} +EXPORT_SYMBOL_GPL(blkiocg_css_put); + +struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) +{ + return 
container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkio_cgroup, css); +} +EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); + +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors) +{ + blkg->time += time; + blkg->sectors += sectors; +} +EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); + +void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) +{ + unsigned long flags; + + spin_lock_irqsave(&blkcg->lock, flags); + rcu_assign_pointer(blkg->key, key); + blkg->blkcg_id = css_id(&blkcg->css); + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + spin_unlock_irqrestore(&blkcg->lock, flags); +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Need to take css reference ? */ + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); +#endif + blkg->dev = dev; +} +EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); + +static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + hlist_del_init_rcu(&blkg->blkcg_node); + blkg->blkcg_id = 0; +} + +/* + * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 + * indicating that blk_group was unhashed by the time we got to it. + */ +int blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + struct blkio_cgroup *blkcg; + unsigned long flags; + struct cgroup_subsys_state *css; + int ret = 1; + + rcu_read_lock(); + css = css_lookup(&blkio_subsys, blkg->blkcg_id); + if (!css) + goto out; + + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); + +/* called under rcu_read_lock(). 
*/ +struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +{ + struct blkio_group *blkg; + struct hlist_node *n; + void *__key; + + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + __key = blkg->key; + if (__key == key) + return blkg; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(blkiocg_lookup_group); + +#define SHOW_FUNCTION(__VAR) \ +static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct blkio_cgroup *blkcg; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + return (u64)blkcg->__VAR; \ +} + +SHOW_FUNCTION(weight); +#undef SHOW_FUNCTION + +static int +blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct hlist_node *n; + struct blkio_policy_type *blkiop; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_update_group_weight_fn(blkg, + blkcg->weight); + } + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + return 0; +} + +int blkiocg_set_weight(struct cgroup *cgroup, u64 val) +{ + return blkiocg_weight_write(cgroup, NULL, val); +} + +#define SHOW_FUNCTION_PER_GROUP(__VAR) \ +static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype, struct seq_file *m) \ +{ \ + struct blkio_cgroup *blkcg; \ + struct blkio_group *blkg; \ + struct hlist_node *n; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + rcu_read_lock(); \ + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ + if (blkg->dev) \ + seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ + MINOR(blkg->dev), blkg->__VAR); \ 
+ } \ + rcu_read_unlock(); \ + cgroup_unlock(); \ + return 0; \ +} + +SHOW_FUNCTION_PER_GROUP(time); +SHOW_FUNCTION_PER_GROUP(sectors); +#ifdef CONFIG_DEBUG_BLK_CGROUP +SHOW_FUNCTION_PER_GROUP(dequeue); +#endif +#undef SHOW_FUNCTION_PER_GROUP + +#ifdef CONFIG_DEBUG_BLK_CGROUP +void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->dequeue += dequeue; +} +EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); +#endif + +struct cftype blkio_files[] = { + { + .name = "weight", + .read_u64 = blkiocg_weight_read, + .write_u64 = blkiocg_weight_write, + }, + { + .name = "time", + .read_seq_string = blkiocg_time_read, + }, + { + .name = "sectors", + .read_seq_string = blkiocg_sectors_read, + }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "dequeue", + .read_seq_string = blkiocg_dequeue_read, + }, +#endif +}; + +static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, blkio_files, + ARRAY_SIZE(blkio_files)); +} + +static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + unsigned long flags; + struct blkio_group *blkg; + void *key; + struct blkio_policy_type *blkiop; + + rcu_read_lock(); +remove_entry: + spin_lock_irqsave(&blkcg->lock, flags); + + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + goto done; + } + + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); + key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); + + spin_unlock_irqrestore(&blkcg->lock, flags); + + /* + * This blkio_group is being unlinked as associated cgroup is going + * away. Let all the IO controlling policies know about this event. + * + * Every policy registered on blkio_list is notified through its + * blkio_unlink_group_fn callback, called with blkio_list_lock held + * and with blkcg->lock already dropped. 
+ */ + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_unlink_group_fn(key, blkg); + spin_unlock(&blkio_list_lock); + goto remove_entry; +done: + free_css_id(&blkio_subsys, &blkcg->css); + rcu_read_unlock(); + kfree(blkcg); +} + +static struct cgroup_subsys_state * +blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg, *parent_blkcg; + + if (!cgroup->parent) { + blkcg = &blkio_root_cgroup; + goto done; + } + + /* Currently we do not support hierarchy deeper than two level (0,1) */ + parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); + if (css_depth(&parent_blkcg->css) > 0) + return ERR_PTR(-EINVAL); + + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) + return ERR_PTR(-ENOMEM); + + blkcg->weight = BLKIO_WEIGHT_DEFAULT; +done: + spin_lock_init(&blkcg->lock); + INIT_HLIST_HEAD(&blkcg->blkg_list); + + return &blkcg->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. For now we allow a task to change + * its cgroup only if it's the only owner of its ioc. 
+ */ +static int blkiocg_can_attach(struct cgroup_subsys *subsys, + struct cgroup *cgroup, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc && atomic_read(&ioc->nr_tasks) > 1) + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc) + ioc->cgroup_changed = 1; + task_unlock(tsk); +} + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, + .subsys_id = blkio_subsys_id, + .use_id = 1, +}; + +void blkio_policy_register(struct blkio_policy_type *blkiop) +{ + spin_lock(&blkio_list_lock); + list_add_tail(&blkiop->list, &blkio_list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_register); + +void blkio_policy_unregister(struct blkio_policy_type *blkiop) +{ + spin_lock(&blkio_list_lock); + list_del_init(&blkiop->list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_unregister); diff -urNp linux-2.6.32.48/block/blk-cgroup.h linux-2.6.32.48-openvz/block/blk-cgroup.h --- linux-2.6.32.48/block/blk-cgroup.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/block/blk-cgroup.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,127 @@ +#ifndef _BLK_CGROUP_H +#define _BLK_CGROUP_H +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include + +#ifdef CONFIG_BLK_CGROUP + 
+struct blkio_cgroup { + struct cgroup_subsys_state css; + unsigned int weight; + spinlock_t lock; + struct hlist_head blkg_list; +}; + +struct blkio_group { + /* An rcu protected unique identifier for the group */ + void *key; + struct hlist_node blkcg_node; + unsigned short blkcg_id; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Store cgroup path */ + char path[128]; + /* How many times this group has been removed from service tree */ + unsigned long dequeue; +#endif + /* The device MKDEV(major, minor), this group has been created for */ + dev_t dev; + + /* total disk time and nr sectors dispatched by this group */ + unsigned long time; + unsigned long sectors; +}; + +extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg); +extern void blkiocg_css_put(struct blkio_cgroup *blkcg); + +typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); +typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, + unsigned int weight); + +struct blkio_policy_ops { + blkio_unlink_group_fn *blkio_unlink_group_fn; + blkio_update_group_weight_fn *blkio_update_group_weight_fn; +}; + +struct blkio_policy_type { + struct list_head list; + struct blkio_policy_ops ops; +}; + +/* Blkio controller policy registration */ +extern void blkio_policy_register(struct blkio_policy_type *); +extern void blkio_policy_unregister(struct blkio_policy_type *); + +#else + +struct blkio_group { +}; + +struct blkio_policy_type { +}; + +static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } +static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } + +#endif + +#define BLKIO_WEIGHT_MIN 100 +#define BLKIO_WEIGHT_MAX 1000 +#define BLKIO_WEIGHT_DEFAULT 500 + +#ifdef CONFIG_DEBUG_BLK_CGROUP +static inline char *blkg_path(struct blkio_group *blkg) +{ + return blkg->path; +} +void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue); +#else +static inline char *blkg_path(struct blkio_group *blkg) { 
return NULL; } +static inline void blkiocg_update_blkio_group_dequeue_stats( + struct blkio_group *blkg, unsigned long dequeue) {} +#endif + +#ifdef CONFIG_BLK_CGROUP +extern struct blkio_cgroup blkio_root_cgroup; +extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev); +extern int blkiocg_del_blkio_group(struct blkio_group *blkg); +extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, + void *key); +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors); +#else +struct cgroup; +static inline struct blkio_cgroup * +cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } + +static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) +{ +} + +static inline int +blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } + +static inline struct blkio_group * +blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } +static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors) +{ +} +#endif +#endif /* _BLK_CGROUP_H */ diff -urNp linux-2.6.32.48/block/blk-settings.c linux-2.6.32.48-openvz/block/blk-settings.c --- linux-2.6.32.48/block/blk-settings.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/block/blk-settings.c 2011-11-21 17:40:45.000000000 -0500 @@ -9,6 +9,7 @@ #include /* for max_pfn/max_low_pfn */ #include #include +#include #include "blk.h" @@ -142,7 +143,7 @@ void blk_queue_make_request(struct reque q->nr_batching = BLK_BATCH_REQ; q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ + q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ if (q->unplug_delay == 0) q->unplug_delay = 1; diff -urNp linux-2.6.32.48/block/bsg.c 
linux-2.6.32.48-openvz/block/bsg.c --- linux-2.6.32.48/block/bsg.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/block/bsg.c 2011-11-21 17:40:45.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct r rq->cmd_len = hdr->request_len; rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->timeout = (hdr->timeout * HZ) / 1000; + rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) rq->timeout = q->sg_timeout; if (!rq->timeout) diff -urNp linux-2.6.32.48/block/cfq-iosched.c linux-2.6.32.48-openvz/block/cfq-iosched.c --- linux-2.6.32.48/block/cfq-iosched.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/block/cfq-iosched.c 2011-11-21 18:23:21.000000000 -0500 @@ -9,9 +9,11 @@ #include #include #include +#include #include #include #include +#include "blk-cgroup.h" /* * tunables @@ -27,6 +29,8 @@ static const int cfq_slice_sync = HZ / 1 static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ +static const int cfq_hist_divisor = 4; /* * offset from end of service tree @@ -38,14 +42,12 @@ static int cfq_slice_idle = HZ / 125; */ #define CFQ_MIN_TT (2) -/* - * Allow merged cfqqs to perform this amount of seeky I/O before - * deciding to break the queues up again. 
- */ -#define CFQQ_COOP_TOUT (HZ) - #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) +#define CFQ_SERVICE_SHIFT 12 + +#define CFQQ_SEEK_THR (8 * 1024) +#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) @@ -63,6 +65,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock); #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) #define sample_valid(samples) ((samples) > 80) +#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* * Most of our rbtree usage is for sorting with min extraction, so @@ -73,8 +76,12 @@ static DEFINE_SPINLOCK(ioc_gone_lock); struct cfq_rb_root { struct rb_root rb; struct rb_node *left; + unsigned count; + u64 min_vdisktime; + struct rb_node *active; + unsigned total_weight; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } /* * Per process-grouping structure @@ -105,6 +112,11 @@ struct cfq_queue { /* fifo list of requests in sort_list */ struct list_head fifo; + /* time when queue got scheduled in to dispatch first request. */ + unsigned long dispatch_start; + unsigned int allocated_slice; + /* time when first request from queue completed and slice started. */ + unsigned long slice_start; unsigned long slice_end; long slice_resid; unsigned int slice_dispatch; @@ -122,11 +134,66 @@ struct cfq_queue { u64 seek_total; sector_t seek_mean; sector_t last_request_pos; - unsigned long seeky_start; pid_t pid; + struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; + struct cfq_group *cfqg; + struct cfq_group *orig_cfqg; + /* Sectors dispatched in current dispatch round */ + unsigned long nr_sectors; +}; + +/* + * First index in the service_trees. + * IDLE has its own tree (service_tree_idle) and never indexes the array + */ +enum wl_prio_t { + BE_WORKLOAD = 0, + RT_WORKLOAD = 1, + IDLE_WORKLOAD = 2, +}; + +/* + * Second index in the service_trees. 
+ */ +enum wl_type_t { + ASYNC_WORKLOAD = 0, + SYNC_NOIDLE_WORKLOAD = 1, + SYNC_WORKLOAD = 2 +}; + +/* This is per cgroup per device grouping structure */ +struct cfq_group { + /* group service_tree member */ + struct rb_node rb_node; + + /* group service_tree key */ + u64 vdisktime; + unsigned int weight; + bool on_st; + + /* number of cfqq currently on this group */ + int nr_cfqq; + + /* Per group busy queues average. Useful for workload slice calc. */ + unsigned int busy_queues_avg[2]; + /* + * rr lists of queues with requests, one rr for each priority class. + * Counts are embedded in the cfq_rb_root + */ + struct cfq_rb_root service_trees[2][3]; + struct cfq_rb_root service_tree_idle; + + unsigned long saved_workload_slice; + enum wl_type_t saved_workload; + enum wl_prio_t saved_serving_prio; + struct blkio_group blkg; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + struct hlist_node cfqd_node; + atomic_t ref; +#endif }; /* @@ -134,11 +201,18 @@ struct cfq_queue { */ struct cfq_data { struct request_queue *queue; + /* Root service tree for cfq_groups */ + struct cfq_rb_root grp_service_tree; + struct cfq_group root_group; /* - * rr list of queues with requests and the count of them - */ - struct cfq_rb_root service_tree; + * The priority currently being served + */ + enum wl_prio_t serving_prio; + enum wl_type_t serving_type; + unsigned long workload_expires; + struct cfq_group *serving_group; + bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. 
These @@ -157,8 +231,14 @@ struct cfq_data { */ int rq_queued; int hw_tag; - int hw_tag_samples; - int rq_in_driver_peak; + /* + * hw_tag can be + * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) + * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) + * 0 => no NCQ + */ + int hw_tag_est_depth; + unsigned int hw_tag_samples; /* * idle window management @@ -188,6 +268,7 @@ struct cfq_data { unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; unsigned int cfq_latency; + unsigned int cfq_group_isolation; struct list_head cic_list; @@ -196,9 +277,28 @@ struct cfq_data { */ struct cfq_queue oom_cfqq; - unsigned long last_end_sync_rq; + unsigned long last_delayed_sync; + + /* List of cfq groups being managed on this device*/ + struct hlist_head cfqg_list; + struct rcu_head rcu; }; +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); + +static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, + enum wl_prio_t prio, + enum wl_type_t type) +{ + if (!cfqg) + return NULL; + + if (prio == IDLE_WORKLOAD) + return &cfqg->service_tree_idle; + + return &cfqg->service_trees[prio][type]; +} + enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ @@ -210,6 +310,9 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ + CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ + CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ + CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ }; #define CFQ_CFQQ_FNS(name) \ @@ -236,13 +339,78 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); +CFQ_CFQQ_FNS(split_coop); +CFQ_CFQQ_FNS(deep); +CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS +#ifdef CONFIG_DEBUG_CFQ_IOSCHED +#define cfq_log_cfqq(cfqd, 
cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blkg_path(&(cfqq)->cfqg->blkg), ##args) + +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ + blkg_path(&(cfqg)->blkg), ##args) + +#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) +#endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) +/* Traverses through cfq group service trees; i, j and st must be lvalues */ +#define for_each_cfqg_st(cfqg, i, j, st) \ + for (i = 0; i <= IDLE_WORKLOAD; i++) \ + for (j = 0, st = i < IDLE_WORKLOAD ? &(cfqg)->service_trees[i][j]\ + : &(cfqg)->service_tree_idle; \ + (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ + (i == IDLE_WORKLOAD && j == 0); \ + j++, st = i < IDLE_WORKLOAD ? \ + &(cfqg)->service_trees[i][j] : NULL) + + +static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) +{ + if (cfq_class_idle(cfqq)) + return IDLE_WORKLOAD; + if (cfq_class_rt(cfqq)) + return RT_WORKLOAD; + return BE_WORKLOAD; +} + + +static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) +{ + if (!cfq_cfqq_sync(cfqq)) + return ASYNC_WORKLOAD; + if (!cfq_cfqq_idle_window(cfqq)) + return SYNC_NOIDLE_WORKLOAD; + return SYNC_WORKLOAD; +} + +static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, + struct cfq_data *cfqd, + struct cfq_group *cfqg) +{ + if (wl == IDLE_WORKLOAD) + return cfqg->service_tree_idle.count; + + return cfqg->service_trees[wl][ASYNC_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_WORKLOAD].count; +} + +static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg) +{ + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; +} + static void 
cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, struct io_context *, gfp_t); @@ -291,7 +459,7 @@ static int cfq_queue_empty(struct reques { struct cfq_data *cfqd = q->elevator->elevator_data; - return !cfqd->busy_queues; + return !cfqd->rq_queued; } /* @@ -315,10 +483,110 @@ cfq_prio_to_slice(struct cfq_data *cfqd, return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } +static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +{ + u64 d = delta << CFQ_SERVICE_SHIFT; + + d = d * BLKIO_WEIGHT_DEFAULT; + do_div(d, cfqg->weight); + return d; +} + +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta > 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta < 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static void update_min_vdisktime(struct cfq_rb_root *st) +{ + u64 vdisktime = st->min_vdisktime; + struct cfq_group *cfqg; + + if (st->active) { + cfqg = rb_entry_cfqg(st->active); + vdisktime = cfqg->vdisktime; + } + + if (st->left) { + cfqg = rb_entry_cfqg(st->left); + vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); + } + + st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); +} + +/* + * get averaged number of queues of RT/BE priority. 
+ * average is updated, with a formula that gives more weight to higher numbers, + * to quickly follows sudden increases and decrease slowly + */ + +static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg, bool rt) +{ + unsigned min_q, max_q; + unsigned mult = cfq_hist_divisor - 1; + unsigned round = cfq_hist_divisor / 2; + unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); + + min_q = min(cfqg->busy_queues_avg[rt], busy); + max_q = max(cfqg->busy_queues_avg[rt], busy); + cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / + cfq_hist_divisor; + return cfqg->busy_queues_avg[rt]; +} + +static inline unsigned +cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + return cfq_target_latency * cfqg->weight / st->total_weight; +} + static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; + unsigned slice = cfq_prio_to_slice(cfqd, cfqq); + if (cfqd->cfq_latency) { + /* + * interested queues (we consider only the ones with the same + * priority class in the cfq group) + */ + unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, + cfq_class_rt(cfqq)); + unsigned sync_slice = cfqd->cfq_slice[1]; + unsigned expect_latency = sync_slice * iq; + unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); + + if (expect_latency > group_slice) { + unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; + /* scale low_slice according to IO priority + * and sync vs async */ + unsigned low_slice = + min(slice, base_low_slice * slice / sync_slice); + /* the adapted slice value is scaled to fit all iqs + * into the target latency */ + slice = max(slice * group_slice / expect_latency, + low_slice); + } + } + cfqq->slice_start = jiffies; + cfqq->slice_end = jiffies + slice; + cfqq->allocated_slice = slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ 
-343,9 +611,9 @@ static inline bool cfq_slice_used(struct * behind the head is penalized and only allowed to a certain extent. */ static struct request * -cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) +cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) { - sector_t last, s1, s2, d1 = 0, d2 = 0; + sector_t s1, s2, d1 = 0, d2 = 0; unsigned long back_max; #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ @@ -368,8 +636,6 @@ cfq_choose_req(struct cfq_data *cfqd, st s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); - last = cfqd->last_position; - /* * by definition, 1KiB is 2 sectors */ @@ -437,6 +703,10 @@ cfq_choose_req(struct cfq_data *cfqd, st */ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) { + /* Service tree is empty */ + if (!root->count) + return NULL; + if (!root->left) root->left = rb_first(&root->rb); @@ -446,6 +716,17 @@ static struct cfq_queue *cfq_rb_first(st return NULL; } +static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) +{ + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_cfqg(root->left); + + return NULL; +} + static void rb_erase_init(struct rb_node *n, struct rb_root *root) { rb_erase(n, root); @@ -457,6 +738,7 @@ static void cfq_rb_erase(struct rb_node if (root->left == n) root->left = NULL; rb_erase_init(n, &root->rb); + --root->count; } /* @@ -483,7 +765,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, next = rb_entry_rq(rbnext); } - return cfq_choose_req(cfqd, next, prev); + return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); } static unsigned long cfq_slice_offset(struct cfq_data *cfqd, @@ -492,12 +774,334 @@ static unsigned long cfq_slice_offset(st /* * just an approximation, should be ok. 
*/ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - + return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } +static inline s64 +cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + return cfqg->vdisktime - st->min_vdisktime; +} + +static void +__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct cfq_group *__cfqg; + s64 key = cfqg_key(st, cfqg); + int left = 1; + + while (*node != NULL) { + parent = *node; + __cfqg = rb_entry_cfqg(parent); + + if (key < cfqg_key(st, __cfqg)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &cfqg->rb_node; + + rb_link_node(&cfqg->rb_node, parent, node); + rb_insert_color(&cfqg->rb_node, &st->rb); +} + +static void +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *__cfqg; + struct rb_node *n; + + cfqg->nr_cfqq++; + if (cfqg->on_st) + return; + + /* + * Currently put the group at the end. Later implement something + * so that groups get a smaller vtime based on their weights, so that + * a group does not lose all if it was not continuously backlogged. 
+ */ + n = rb_last(&st->rb); + if (n) { + __cfqg = rb_entry_cfqg(n); + cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; + } else + cfqg->vdisktime = st->min_vdisktime; + + __cfq_group_service_tree_add(st, cfqg); + cfqg->on_st = true; + st->total_weight += cfqg->weight; +} + +static void +cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + if (st->active == &cfqg->rb_node) + st->active = NULL; + + BUG_ON(cfqg->nr_cfqq < 1); + cfqg->nr_cfqq--; + + /* If there are other cfq queues under this group, don't delete it */ + if (cfqg->nr_cfqq) + return; + + cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); + cfqg->on_st = false; + st->total_weight -= cfqg->weight; + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + cfq_rb_erase(&cfqg->rb_node, st); + cfqg->saved_workload_slice = 0; + blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); +} + +static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) +{ + unsigned int slice_used; + + /* + * Queue got expired before even a single request completed or + * got expired immediately after first request completion. + */ + if (!cfqq->slice_start || cfqq->slice_start == jiffies) { + /* + * Also charge the seek time incurred to the group, otherwise + * if there are multiple queues in the group, each can dispatch + * a single request on seeky media and cause lots of seek time + * and group will never know it. 
+ */ + slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), + 1); + } else { + slice_used = jiffies - cfqq->slice_start; + if (slice_used > cfqq->allocated_slice) + slice_used = cfqq->allocated_slice; + } + + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, + cfqq->nr_sectors); + return slice_used; +} + +static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, + struct cfq_queue *cfqq) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + unsigned int used_sl, charge_sl; + int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) + - cfqg->service_tree_idle.count; + + BUG_ON(nr_sync < 0); + used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); + + if (!cfq_cfqq_sync(cfqq) && !nr_sync) + charge_sl = cfqq->allocated_slice; + + /* Can't update vdisktime while group is on service tree */ + cfq_rb_erase(&cfqg->rb_node, st); + cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); + __cfq_group_service_tree_add(st, cfqg); + + /* This group is being expired. 
Save the context */ + if (time_after(cfqd->workload_expires, jiffies)) { + cfqg->saved_workload_slice = cfqd->workload_expires + - jiffies; + cfqg->saved_workload = cfqd->serving_type; + cfqg->saved_serving_prio = cfqd->serving_prio; + } else + cfqg->saved_workload_slice = 0; + + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, + st->min_vdisktime); + blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, + cfqq->nr_sectors); +} + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct cfq_group, blkg); + return NULL; +} + +void +cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + cfqg_of_blkg(blkg)->weight = weight; +} + +static struct cfq_group * +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct cfq_group *cfqg = NULL; + void *key = cfqd; + int i, j; + struct cfq_rb_root *st; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major = 0, minor = 0; /* 0:0 if sscanf() below fails */ + + /* Do we need to take this reference */ + if (!blkiocg_css_tryget(blkcg)) + return NULL; + + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg || !create) + goto done; + + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); + if (!cfqg) + goto done; + + cfqg->weight = blkcg->weight; + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first. 
+ */ + atomic_set(&cfqg->ref, 1); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + MKDEV(major, minor)); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + +done: + blkiocg_css_put(blkcg); + return cfqg; +} + +/* + * Search for the cfq group current task belongs to. If create = 1, then also + * create the cfq group if it does not exist. request_queue lock must be held. + */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + struct cgroup *cgroup; + struct cfq_group *cfqg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); + if (!cfqg && create) + cfqg = &cfqd->root_group; + rcu_read_unlock(); + return cfqg; +} + +static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + /* Currently, all async queues are mapped to root group */ + if (!cfq_cfqq_sync(cfqq)) + cfqg = &cfqq->cfqd->root_group; + + cfqq->cfqg = cfqg; + /* cfqq reference on cfqg */ + atomic_inc(&cfqq->cfqg->ref); +} + +static void cfq_put_cfqg(struct cfq_group *cfqg) +{ + struct cfq_rb_root *st; + int i, j; + + BUG_ON(atomic_read(&cfqg->ref) <= 0); + if (!atomic_dec_and_test(&cfqg->ref)) + return; + for_each_cfqg_st(cfqg, i, j, st) + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); + kfree(cfqg); +} + +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); + + hlist_del_init(&cfqg->cfqd_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. 
+ */ + cfq_put_cfqg(cfqg); +} + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) +{ + struct hlist_node *pos, *n; + struct cfq_group *cfqg; + + hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&cfqg->blkg)) + cfq_destroy_cfqg(cfqd, cfqg); + } +} + /* - * The cfqd->service_tree holds all pending cfq_queue's that have + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid cfq_data pointer as long as we hold + * the rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if elevator was exiting, cgroup deletion + * path got to it first. + */ +void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct cfq_data *cfqd = key; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + +#else /* GROUP_IOSCHED */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + return &cfqd->root_group; +} +static inline void +cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { + cfqq->cfqg = cfqg; +} + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} +static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} + +#endif /* GROUP_IOSCHED */ + +/* + * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. 
It is sorted in the order that * we will service the queues. */ @@ -507,11 +1111,42 @@ static void cfq_service_tree_add(struct struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; + struct cfq_rb_root *service_tree; int left; + int new_cfqq = 1; + int group_changed = 0; + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD + && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { + /* Move this cfq to root group */ + cfq_log_cfqq(cfqd, cfqq, "moving to root group"); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfqq->orig_cfqg = cfqq->cfqg; + cfqq->cfqg = &cfqd->root_group; + atomic_inc(&cfqd->root_group.ref); + group_changed = 1; + } else if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { + /* cfqq is sequential now needs to go to its original group */ + BUG_ON(cfqq->cfqg != &cfqd->root_group); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfq_put_cfqg(cfqq->cfqg); + cfqq->cfqg = cfqq->orig_cfqg; + cfqq->orig_cfqg = NULL; + group_changed = 1; + cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); + } +#endif + service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), + cfqq_type(cfqq)); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&cfqd->service_tree.rb); + parent = rb_last(&service_tree->rb); if (parent && parent != &cfqq->rb_node) { __cfqq = rb_entry(parent, struct cfq_queue, rb_node); rb_key += __cfqq->rb_key; @@ -529,23 +1164,27 @@ static void cfq_service_tree_add(struct cfqq->slice_resid = 0; } else { rb_key = -HZ; - __cfqq = cfq_rb_first(&cfqd->service_tree); + __cfqq = cfq_rb_first(service_tree); rb_key += __cfqq ? 
__cfqq->rb_key : jiffies; } if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + new_cfqq = 0; /* * same position, nothing more to do */ - if (rb_key == cfqq->rb_key) + if (rb_key == cfqq->rb_key && + cfqq->service_tree == service_tree) return; - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; } left = 1; parent = NULL; - p = &cfqd->service_tree.rb.rb_node; + cfqq->service_tree = service_tree; + p = &service_tree->rb.rb_node; while (*p) { struct rb_node **n; @@ -553,35 +1192,28 @@ static void cfq_service_tree_add(struct __cfqq = rb_entry(parent, struct cfq_queue, rb_node); /* - * sort RT queues first, we always want to give - * preference to them. IDLE queues goes to the back. - * after that, sort on the next service time. + * sort by key, that represents service time. */ - if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) - n = &(*p)->rb_right; - else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) - n = &(*p)->rb_right; - else if (time_before(rb_key, __cfqq->rb_key)) + if (time_before(rb_key, __cfqq->rb_key)) n = &(*p)->rb_left; - else + else { n = &(*p)->rb_right; - - if (n == &(*p)->rb_right) left = 0; + } p = n; } if (left) - cfqd->service_tree.left = &cfqq->rb_node; + service_tree->left = &cfqq->rb_node; cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); + rb_insert_color(&cfqq->rb_node, &service_tree->rb); + service_tree->count++; + if ((add_front || !new_cfqq) && !group_changed) + return; + cfq_group_service_tree_add(cfqd, cfqq->cfqg); } static struct cfq_queue * @@ -683,13 +1315,16 @@ static void cfq_del_cfqq_rr(struct cfq_d BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, 
&cfqd->service_tree); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; + } if (cfqq->p_root) { rb_erase(&cfqq->p_node, cfqq->p_root); cfqq->p_root = NULL; } + cfq_group_service_tree_del(cfqd, cfqq->cfqg); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; } @@ -700,7 +1335,6 @@ static void cfq_del_cfqq_rr(struct cfq_d static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); @@ -708,8 +1342,17 @@ static void cfq_del_rq_rb(struct request elv_rb_del(&cfqq->sort_list, rq); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { + /* + * Queue will be deleted from service tree when we actually + * expire it later. Right now just remove it from prio tree + * as it is empty. + */ + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } + } } static void cfq_add_rq_rb(struct request *rq) @@ -734,7 +1377,7 @@ static void cfq_add_rq_rb(struct request * check if this request is a better next-serve candidate */ prev = cfqq->next_rq; - cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); + cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); /* * adjust priority tree position, if ->next_rq changes @@ -841,6 +1484,7 @@ static void cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { + struct cfq_queue *cfqq = RQ_CFQQ(rq); /* * reposition in fifo if next is older than rq */ @@ -850,6 +1494,8 @@ cfq_merged_requests(struct request_queue rq_set_fifo_time(rq, rq_fifo_time(next)); } + if (cfqq->next_rq == next) + cfqq->next_rq = rq; cfq_remove_request(next); } @@ -883,8 +1529,12 @@ static void __cfq_set_active_queue(struc { if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active"); + 
cfqq->slice_start = 0; + cfqq->dispatch_start = jiffies; + cfqq->allocated_slice = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -911,6 +1561,16 @@ __cfq_slice_expired(struct cfq_data *cfq del_timer(&cfqd->idle_slice_timer); cfq_clear_cfqq_wait_request(cfqq); + cfq_clear_cfqq_wait_busy(cfqq); + + /* + * If this cfqq is shared between multiple processes, check to + * make sure that those processes are still issuing I/Os within + * the mean seek distance. If not, it may be time to break the + * queues apart again. + */ + if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) + cfq_mark_cfqq_split_coop(cfqq); /* * store what was left of this slice, if the queue idled/timed out @@ -920,11 +1580,19 @@ __cfq_slice_expired(struct cfq_data *cfq cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } + cfq_group_served(cfqd, cfqq->cfqg, cfqq); + + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) + cfq_del_cfqq_rr(cfqd, cfqq); + cfq_resort_rr_list(cfqd, cfqq); if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; + if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) + cfqd->grp_service_tree.active = NULL; + if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -945,10 +1613,39 @@ static inline void cfq_slice_expired(str */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) + struct cfq_rb_root *service_tree = + service_tree_for(cfqd->serving_group, cfqd->serving_prio, + cfqd->serving_type); + + if (!cfqd->rq_queued) + return NULL; + + /* There is nothing to dispatch */ + if (!service_tree) + return NULL; + if (RB_EMPTY_ROOT(&service_tree->rb)) + return NULL; + return cfq_rb_first(service_tree); +} + +static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg; + struct cfq_queue *cfqq; + int i, j; + struct cfq_rb_root 
*st; + + if (!cfqd->rq_queued) + return NULL; + + cfqg = cfq_get_next_cfqg(cfqd); + if (!cfqg) return NULL; - return cfq_rb_first(&cfqd->service_tree); + for_each_cfqg_st(cfqg, i, j, st) + if ((cfqq = cfq_rb_first(st)) != NULL) + return cfqq; + return NULL; } /* @@ -973,17 +1670,18 @@ static inline sector_t cfq_dist_from_las return cfqd->last_position - blk_rq_pos(rq); } -#define CFQQ_SEEK_THR 8 * 1024 -#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) - static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq) + struct request *rq, bool for_preempt) { sector_t sdist = cfqq->seek_mean; if (!sample_valid(cfqq->seek_samples)) sdist = CFQQ_SEEK_THR; + /* if seek_mean is big, using it as close criteria is meaningless */ + if (sdist > CFQQ_SEEK_THR && !for_preempt) + sdist = CFQQ_SEEK_THR; + return cfq_dist_from_last(cfqd, rq) <= sdist; } @@ -1011,7 +1709,7 @@ static struct cfq_queue *cfqq_close(stru * will contain the closest sector. */ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1022,7 +1720,7 @@ static struct cfq_queue *cfqq_close(stru return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) return __cfqq; return NULL; @@ -1049,6 +1747,12 @@ static struct cfq_queue *cfq_close_coope return NULL; /* + * Don't search priority tree if it's the only queue in the group. + */ + if (cur_cfqq->cfqg->nr_cfqq == 1) + return NULL; + + /* * We should notice if some of the queues are cooperating, eg * working closely on the same area of the disk. In that case, * we can group them together and don't waste time idling. 
@@ -1057,6 +1761,10 @@ static struct cfq_queue *cfq_close_coope if (!cfqq) return NULL; + /* If new queue belongs to different cfq_group, don't choose it */ + if (cur_cfqq->cfqg != cfqq->cfqg) + return NULL; + /* * It only makes sense to merge sync queues. */ @@ -1065,9 +1773,43 @@ static struct cfq_queue *cfq_close_coope if (CFQQ_SEEKY(cfqq)) return NULL; + /* + * Do not merge queues of different priority classes + */ + if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) + return NULL; + return cfqq; } +/* + * Determine whether we should enforce idle window for this queue. + */ + +static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + enum wl_prio_t prio = cfqq_prio(cfqq); + struct cfq_rb_root *service_tree = cfqq->service_tree; + + BUG_ON(!service_tree); + BUG_ON(!service_tree->count); + + /* We never do for idle class queues. */ + if (prio == IDLE_WORKLOAD) + return false; + + /* We do for queues that were marked with idle window flag. */ + if (cfq_cfqq_idle_window(cfqq) && + !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) + return true; + + /* + * Otherwise, we do only if they are the last ones + * in their service tree. 
+ */ + return service_tree->count == 1 && cfq_cfqq_sync(cfqq); +} + static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; @@ -1088,13 +1830,13 @@ static void cfq_arm_slice_timer(struct c /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) + if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) return; /* - * still requests with the driver, don't idle + * still active requests from this queue, don't idle */ - if (rq_in_driver(cfqd)) + if (cfqq->dispatched) return; /* @@ -1115,14 +1857,7 @@ static void cfq_arm_slice_timer(struct c cfq_mark_cfqq_wait_request(cfqq); - /* - * we don't want to idle for seeks, but we do want to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. so allow a little bit of time for him to submit a new rq - */ sl = cfqd->cfq_slice_idle; - if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) - sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); @@ -1145,6 +1880,7 @@ static void cfq_dispatch_insert(struct r if (cfq_cfqq_sync(cfqq)) cfqd->sync_flight++; + cfqq->nr_sectors += blk_rq_sectors(rq); } /* @@ -1198,15 +1934,6 @@ static void cfq_setup_merge(struct cfq_q int process_refs, new_process_refs; struct cfq_queue *__cfqq; - /* - * If there are no process references on the new_cfqq, then it is - * unsafe to follow the ->new_cfqq chain as other cfqq's in the - * chain may have dropped their last reference (not just their - * last process reference). 
- */ - if (!cfqq_process_refs(new_cfqq)) - return; - /* Avoid a circular list and skip interim queue merges */ while ((__cfqq = new_cfqq->new_cfqq)) { if (__cfqq == cfqq) @@ -1215,17 +1942,17 @@ static void cfq_setup_merge(struct cfq_q } process_refs = cfqq_process_refs(cfqq); - new_process_refs = cfqq_process_refs(new_cfqq); /* * If the process for the cfqq has gone away, there is no * sense in merging the queues. */ - if (process_refs == 0 || new_process_refs == 0) + if (process_refs == 0) return; /* * Merge in the direction of the lesser amount of work. */ + new_process_refs = cfqq_process_refs(new_cfqq); if (new_process_refs >= process_refs) { cfqq->new_cfqq = new_cfqq; atomic_add(process_refs, &new_cfqq->ref); @@ -1235,6 +1962,140 @@ static void cfq_setup_merge(struct cfq_q } } +static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, + struct cfq_group *cfqg, enum wl_prio_t prio) +{ + struct cfq_queue *queue; + int i; + bool key_valid = false; + unsigned long lowest_key = 0; + enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; + + for (i = 0; i <= SYNC_WORKLOAD; ++i) { + /* select the one with lowest rb_key */ + queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); + if (queue && + (!key_valid || time_before(queue->rb_key, lowest_key))) { + lowest_key = queue->rb_key; + cur_best = i; + key_valid = true; + } + } + + return cur_best; +} + +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + unsigned slice; + unsigned count; + struct cfq_rb_root *st; + unsigned group_slice; + + if (!cfqg) { + cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->workload_expires = jiffies + 1; + return; + } + + /* Choose next priority. 
RT > BE > IDLE */ + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) + cfqd->serving_prio = RT_WORKLOAD; + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) + cfqd->serving_prio = BE_WORKLOAD; + else { + cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->workload_expires = jiffies + 1; + return; + } + + /* + * For RT and BE, we have to choose also the type + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload + * expiration time + */ + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); + count = st->count; + + /* + * check workload expiration, and that we still have other queues ready + */ + if (count && !time_after(jiffies, cfqd->workload_expires)) + return; + + /* otherwise select new workload type */ + cfqd->serving_type = + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); + count = st->count; + + /* + * the workload slice is computed as a fraction of target latency + * proportional to the number of queues in that workload, over + * all the queues in the same priority class + */ + group_slice = cfq_group_slice(cfqd, cfqg); + + slice = group_slice * count / + max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], + cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); + + if (cfqd->serving_type == ASYNC_WORKLOAD) { + unsigned int tmp; + + /* + * Async queues are currently system wide. Just taking + * proportion of queues within the same group will lead to higher + * async ratio system wide as generally root group is going + * to have higher weight. A more accurate thing would be to + * calculate system wide async/sync ratio. + */ + tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); + tmp = tmp/cfqd->busy_queues; + slice = min_t(unsigned, slice, tmp); + + /* async workload slice is scaled down according to + * the sync/async slice ratio.
*/ + slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; + } else + /* sync workload slice is at least 2 * cfq_slice_idle */ + slice = max(slice, 2 * cfqd->cfq_slice_idle); + + slice = max_t(unsigned, slice, CFQ_MIN_TT); + cfqd->workload_expires = jiffies + slice; + cfqd->noidle_tree_requires_idle = false; +} + +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *cfqg; + + if (RB_EMPTY_ROOT(&st->rb)) + return NULL; + cfqg = cfq_rb_first_group(st); + st->active = &cfqg->rb_node; + update_min_vdisktime(st); + return cfqg; +} + +static void cfq_choose_cfqg(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); + + cfqd->serving_group = cfqg; + + /* Restore the workload type data */ + if (cfqg->saved_workload_slice) { + cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; + cfqd->serving_type = cfqg->saved_workload; + cfqd->serving_prio = cfqg->saved_serving_prio; + } else + cfqd->workload_expires = jiffies - 1; + + choose_service_tree(cfqd, cfqg); +} + /* * Select a queue for service. If we have a current active queue, * check whether to continue servicing it, or retrieve and set a new one. @@ -1247,13 +2108,37 @@ static struct cfq_queue *cfq_select_queu if (!cfqq) goto new_queue; + if (!cfqd->rq_queued) + return NULL; + /* - * The active queue has run out of time, expire it and select new. + * We were waiting for group to get backlogged. Expire the queue */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) + if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) goto expire; /* + * The active queue has run out of time, expire it and select new. + */ + if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { + /* + * If slice had not expired at the completion of last request + * we might not have turned on wait_busy flag. Don't expire + * the queue yet. Allow the group to get backlogged. 
+ * + * The very fact that we have used the slice, that means we + * have been idling all along on this queue and it should be + * ok to wait for this request to complete. + */ + if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) + && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { + cfqq = NULL; + goto keep_queue; + } else + goto expire; + } + + /* * The active queue has requests and isn't expired, allow it to * dispatch. */ @@ -1279,7 +2164,7 @@ static struct cfq_queue *cfq_select_queu * conditions to happen (or time out) before selecting a new queue. */ if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { + (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { cfqq = NULL; goto keep_queue; } @@ -1287,6 +2172,13 @@ static struct cfq_queue *cfq_select_queu expire: cfq_slice_expired(cfqd, 0); new_queue: + /* + * Current queue expired. Check if we have to switch to a new + * service tree + */ + if (!new_cfqq) + cfq_choose_cfqg(cfqd); + cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: return cfqq; @@ -1302,6 +2194,9 @@ static int __cfq_forced_dispatch_cfqq(st } BUG_ON(!list_empty(&cfqq->fifo)); + + /* By default cfqq is not expired if it is empty. 
Do it explicitly */ + __cfq_slice_expired(cfqq->cfqd, cfqq, 0); return dispatched; } @@ -1314,11 +2209,10 @@ static int cfq_forced_dispatch(struct cf struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) + while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); cfq_slice_expired(cfqd, 0); - BUG_ON(cfqd->busy_queues); cfq_log(cfqd, "forced_dispatch=%d", dispatched); @@ -1332,7 +2226,7 @@ static bool cfq_may_dispatch(struct cfq_ /* * Drain async requests before we start sync IO */ - if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) return false; /* @@ -1362,9 +2256,9 @@ static bool cfq_may_dispatch(struct cfq_ return false; /* - * Sole queue user, allow bigger slice + * Sole queue user, no limit */ - max_dispatch *= 4; + max_dispatch = -1; } /* @@ -1373,7 +2267,7 @@ static bool cfq_may_dispatch(struct cfq_ * based on the last sync IO we serviced */ if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { - unsigned long last_sync = jiffies - cfqd->last_end_sync_rq; + unsigned long last_sync = jiffies - cfqd->last_delayed_sync; unsigned int depth; depth = last_sync / cfqd->cfq_slice[1]; @@ -1471,11 +2365,13 @@ static int cfq_dispatch_requests(struct * task holds one reference to the queue, dropped when task exits. each rq * in-flight on this queue also holds a reference, dropped when rq is freed. * + * Each cfq queue took a reference on the parent group. Drop it now. * queue lock must be held here. 
*/ static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; + struct cfq_group *cfqg, *orig_cfqg; BUG_ON(atomic_read(&cfqq->ref) <= 0); @@ -1485,14 +2381,19 @@ static void cfq_put_queue(struct cfq_que cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - BUG_ON(cfq_cfqq_on_rr(cfqq)); + cfqg = cfqq->cfqg; + orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } + BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); + cfq_put_cfqg(cfqg); + if (orig_cfqg) + cfq_put_cfqg(orig_cfqg); } /* @@ -1622,14 +2523,8 @@ static void __cfq_exit_single_io_context cic->dead_key = (unsigned long) cic->key; cic->key = NULL; - rcu_read_lock(); - if (rcu_dereference(ioc->ioc_data) == cic) { - rcu_read_unlock(); - spin_lock(&ioc->lock); + if (ioc->ioc_data == cic) rcu_assign_pointer(ioc->ioc_data, NULL); - spin_unlock(&ioc->lock); - } else - rcu_read_unlock(); if (cic->cfqq[BLK_RW_ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); @@ -1791,14 +2686,51 @@ static void cfq_init_cfqq(struct cfq_dat cfqq->pid = pid; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +{ + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); + struct cfq_data *cfqd = cic->key; + unsigned long flags; + struct request_queue *q; + + if (unlikely(!cfqd)) + return; + + q = cfqd->queue; + + spin_lock_irqsave(q->queue_lock, flags); + + if (sync_cfqq) { + /* + * Drop reference to sync queue. A new sync queue will be + * assigned in new group upon arrival of a fresh request. 
+ */ + cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(sync_cfqq); + } + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void cfq_ioc_set_cgroup(struct io_context *ioc) +{ + call_for_each_cic(ioc, changed_cgroup); + ioc->cgroup_changed = 0; +} +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ + static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct cfq_group *cfqg; retry: + cfqg = cfq_get_cfqg(cfqd, 1); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -1829,6 +2761,7 @@ retry: if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, ioc); + cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else cfqq = &cfqd->oom_cfqq; @@ -2020,6 +2953,10 @@ out: if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (unlikely(ioc->cgroup_changed)) + cfq_ioc_set_cgroup(ioc); +#endif return cic; err_free: cfq_cic_free(cic); @@ -2067,19 +3004,6 @@ cfq_update_io_seektime(struct cfq_data * total = cfqq->seek_total + (cfqq->seek_samples/2); do_div(total, cfqq->seek_samples); cfqq->seek_mean = (sector_t)total; - - /* - * If this cfqq is shared between multiple processes, check to - * make sure that those processes are still issuing I/Os within - * the mean seek distance. If not, it may be time to break the - * queues apart again. 
- */ - if (cfq_cfqq_coop(cfqq)) { - if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) - cfqq->seeky_start = jiffies; - else if (!CFQQ_SEEKY(cfqq)) - cfqq->seeky_start = 0; - } } /* @@ -2100,14 +3024,15 @@ cfq_update_idle_window(struct cfq_data * enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); + if (cfqq->queued[0] + cfqq->queued[1] >= 4) + cfq_mark_cfqq_deep(cfqq); + if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfqd->cfq_latency && cfqd->hw_tag && CFQQ_SEEKY(cfqq))) + (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples) + && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { - unsigned int slice_idle = cfqd->cfq_slice_idle; - if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) - slice_idle = msecs_to_jiffies(CFQ_MIN_TT); - if (cic->ttime_mean > slice_idle) + if (cic->ttime_mean > cfqd->cfq_slice_idle) enable_idle = 0; else enable_idle = 1; @@ -2136,9 +3061,6 @@ cfq_should_preempt(struct cfq_data *cfqd if (!cfqq) return false; - if (cfq_slice_used(cfqq)) - return true; - if (cfq_class_idle(new_cfqq)) return false; @@ -2146,12 +3068,31 @@ cfq_should_preempt(struct cfq_data *cfqd return true; /* + * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. + */ + if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) + return false; + + /* * if the new request is sync, but the currently running queue is * not, let the sync request have priority. */ if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) return true; + if (new_cfqq->cfqg != cfqq->cfqg) + return false; + + if (cfq_slice_used(cfqq)) + return true; + + /* Allow preemption only if we are idling on sync-noidle tree */ + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && + cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && + new_cfqq->service_tree->count == 2 && + RB_EMPTY_ROOT(&cfqq->sort_list)) + return true; + /* * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. 
@@ -2172,7 +3113,7 @@ cfq_should_preempt(struct cfq_data *cfqd * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq)) + if (cfq_rq_close(cfqd, cfqq, rq, true)) return true; return false; @@ -2234,9 +3175,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, s if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { del_timer(&cfqd->idle_slice_timer); - __blk_run_queue(cfqd->queue); - } - cfq_mark_cfqq_must_dispatch(cfqq); + cfq_clear_cfqq_wait_request(cfqq); + __blk_run_queue(cfqd->queue); + } else + cfq_mark_cfqq_must_dispatch(cfqq); } } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* @@ -2258,10 +3200,9 @@ static void cfq_insert_request(struct re cfq_log_cfqq(cfqd, cfqq, "insert_request"); cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); - cfq_add_rq_rb(rq); - rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); + cfq_add_rq_rb(rq); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -2272,23 +3213,64 @@ static void cfq_insert_request(struct re */ static void cfq_update_hw_tag(struct cfq_data *cfqd) { - if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) - cfqd->rq_in_driver_peak = rq_in_driver(cfqd); + struct cfq_queue *cfqq = cfqd->active_queue; + + if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth) + cfqd->hw_tag_est_depth = rq_in_driver(cfqd); + + if (cfqd->hw_tag == 1) + return; if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) return; + /* + * If active queue hasn't enough requests and can idle, cfq might not + * dispatch sufficient requests to hardware. 
Don't zero hw_tag in this + * case + */ + if (cfqq && cfq_cfqq_idle_window(cfqq) && + cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < + CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN) + return; + if (cfqd->hw_tag_samples++ < 50) return; - if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) + if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) cfqd->hw_tag = 1; else cfqd->hw_tag = 0; +} + +static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + struct cfq_io_context *cic = cfqd->active_cic; + + /* If there are other queues in the group, don't wait */ + if (cfqq->cfqg->nr_cfqq > 1) + return false; + + if (cfq_slice_used(cfqq)) + return true; + + /* if slice left is less than think time, wait busy */ + if (cic && sample_valid(cic->ttime_samples) + && (cfqq->slice_end - jiffies < cic->ttime_mean)) + return true; - cfqd->hw_tag_samples = 0; - cfqd->rq_in_driver_peak = 0; + /* + * If think time is less than a jiffy then ttime_mean=0 and above + * will not be true. It might happen that slice has not expired yet + * but will expire soon (4-5 ns) during select_queue(). To cover the + * case where think time is less than a jiffy, mark the queue wait + * busy if only 1 jiffy is left in the slice.
+ */ + if (cfqq->slice_end - jiffies == 1) + return true; + + return false; } static void cfq_completed_request(struct request_queue *q, struct request *rq) @@ -2299,7 +3281,7 @@ static void cfq_completed_request(struct unsigned long now; now = jiffies; - cfq_log_cfqq(cfqd, cfqq, "complete"); + cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); cfq_update_hw_tag(cfqd); @@ -2313,7 +3295,8 @@ static void cfq_completed_request(struct if (sync) { RQ_CIC(rq)->last_end_request = now; - cfqd->last_end_sync_rq = now; + if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) + cfqd->last_delayed_sync = now; } /* @@ -2327,18 +3310,32 @@ static void cfq_completed_request(struct cfq_set_prio_slice(cfqd, cfqq); cfq_clear_cfqq_slice_new(cfqq); } + + /* + * Should we wait for next request to come in before we expire + * the queue. + */ + if (cfq_should_wait_busy(cfqd, cfqq)) { + cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; + cfq_mark_cfqq_wait_busy(cfqq); + } + /* - * If there are no requests waiting in this queue, and - * there are other queues ready to issue requests, AND - * those other queues are issuing requests within our - * mean seek distance, give them a chance to run instead - * of idling. 
+ * Idling is not enabled on: + * - expired queues + * - idle-priority queues + * - async queues + * - queues with still some requests queued + * - when there is a close cooperator */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); - else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq) && - sync && !rq_noidle(rq)) - cfq_arm_slice_timer(cfqd); + else if (sync && cfqq_empty && + !cfq_close_cooperator(cfqd, cfqq)) { + cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); + if (cfqd->noidle_tree_requires_idle) + cfq_arm_slice_timer(cfqd); + } } if (!rq_in_driver(cfqd)) @@ -2362,12 +3359,10 @@ static void cfq_prio_boost(struct cfq_qu cfqq->ioprio = IOPRIO_NORM; } else { /* - * check if we need to unboost the queue + * unboost the queue (if needed) */ - if (cfqq->ioprio_class != cfqq->org_ioprio_class) - cfqq->ioprio_class = cfqq->org_ioprio_class; - if (cfqq->ioprio != cfqq->org_ioprio) - cfqq->ioprio = cfqq->org_ioprio; + cfqq->ioprio_class = cfqq->org_ioprio_class; + cfqq->ioprio = cfqq->org_ioprio; } } @@ -2442,14 +3437,6 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, s return cic_to_cfqq(cic, 1); } -static int should_split_cfqq(struct cfq_queue *cfqq) -{ - if (cfqq->seeky_start && - time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) - return 1; - return 0; -} - /* * Returns NULL if a new cfqq should be allocated, or the old cfqq if this * was the last process referring to said cfqq. @@ -2458,9 +3445,9 @@ static struct cfq_queue * split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) { if (cfqq_process_refs(cfqq) == 1) { - cfqq->seeky_start = 0; cfqq->pid = current->pid; cfq_clear_cfqq_coop(cfqq); + cfq_clear_cfqq_split_coop(cfqq); return cfqq; } @@ -2499,7 +3486,7 @@ new_queue: /* * If the queue was seeky for too long, break it apart. 
*/ - if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { + if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); cfqq = split_cfqq(cic, cfqq); if (!cfqq) @@ -2588,6 +3575,11 @@ static void cfq_idle_slice_timer(unsigne */ if (!RB_EMPTY_ROOT(&cfqq->sort_list)) goto out_kick; + + /* + * Queue depth flag is reset only when the idle didn't succeed + */ + cfq_clear_cfqq_deep(cfqq); } expire: cfq_slice_expired(cfqd, timed_out); @@ -2618,6 +3610,11 @@ static void cfq_put_async_queues(struct cfq_put_queue(cfqd->async_idle_cfqq); } +static void cfq_cfqd_free(struct rcu_head *head) +{ + kfree(container_of(head, struct cfq_data, rcu)); +} + static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -2639,25 +3636,49 @@ static void cfq_exit_queue(struct elevat } cfq_put_async_queues(cfqd); + cfq_release_cfq_groups(cfqd); + blkiocg_del_blkio_group(&cfqd->root_group.blkg); spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); - kfree(cfqd); + /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ + call_rcu(&cfqd->rcu, cfq_cfqd_free); } static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; - int i; + int i, j; + struct cfq_group *cfqg; + struct cfq_rb_root *st; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; - cfqd->service_tree = CFQ_RB_ROOT; + /* Init root service tree */ + cfqd->grp_service_tree = CFQ_RB_ROOT; + /* Init root group */ + cfqg = &cfqd->root_group; + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* Give preference to root group over other groups */ + cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* + * Take a reference to root group which we never drop. 
This is just + * to make sure that cfq_put_cfqg() does not try to kfree root group + */ + atomic_set(&cfqg->ref, 1); + blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, + 0); +#endif /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides @@ -2673,6 +3694,7 @@ static void *cfq_init_queue(struct reque */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); atomic_inc(&cfqd->oom_cfqq.ref); + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); INIT_LIST_HEAD(&cfqd->cic_list); @@ -2694,8 +3716,14 @@ static void *cfq_init_queue(struct reque cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_latency = 1; - cfqd->hw_tag = 1; - cfqd->last_end_sync_rq = jiffies; + cfqd->cfq_group_isolation = 0; + cfqd->hw_tag = -1; + /* + * we optimistically start assuming sync ops weren't delayed in last + * second, in order to have larger depth for async operations. + */ + cfqd->last_delayed_sync = jiffies - HZ; + INIT_RCU_HEAD(&cfqd->rcu); return cfqd; } @@ -2764,6 +3792,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd- SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); +SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -2796,6 +3825,7 @@ STORE_FUNCTION(cfq_slice_async_store, &c STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); +STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -2812,6 +3842,7 @@ static struct elv_fs_entry cfq_attrs[] = CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), CFQ_ATTR(low_latency), + CFQ_ATTR(group_isolation), 
__ATTR_NULL }; @@ -2841,6 +3872,17 @@ static struct elevator_type iosched_cfq .elevator_owner = THIS_MODULE, }; +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static struct blkio_policy_type blkio_policy_cfq = { + .ops = { + .blkio_unlink_group_fn = cfq_unlink_blkio_group, + .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, + }, +}; +#else +static struct blkio_policy_type blkio_policy_cfq; +#endif + static int __init cfq_init(void) { /* @@ -2855,6 +3897,7 @@ static int __init cfq_init(void) return -ENOMEM; elv_register(&iosched_cfq); + blkio_policy_register(&blkio_policy_cfq); return 0; } @@ -2862,6 +3905,7 @@ static int __init cfq_init(void) static void __exit cfq_exit(void) { DECLARE_COMPLETION_ONSTACK(all_gone); + blkio_policy_unregister(&blkio_policy_cfq); elv_unregister(&iosched_cfq); ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ diff -urNp linux-2.6.32.48/block/elevator.c linux-2.6.32.48-openvz/block/elevator.c --- linux-2.6.32.48/block/elevator.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/block/elevator.c 2011-11-21 17:40:45.000000000 -0500 @@ -959,12 +959,12 @@ void elv_unregister(struct elevator_type */ if (e->ops.trim) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); if (p->io_context) e->ops.trim(p->io_context); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } diff -urNp linux-2.6.32.48/block/genhd.c linux-2.6.32.48-openvz/block/genhd.c --- linux-2.6.32.48/block/genhd.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/block/genhd.c 2011-11-21 17:40:45.000000000 -0500 @@ -22,9 +22,7 @@ #include "blk.h" static DEFINE_MUTEX(block_class_lock); -#ifndef CONFIG_SYSFS_DEPRECATED struct kobject *block_depr; -#endif /* for extended dynamic devt allocation, currently only one major is used */ #define MAX_EXT_DEVT (1 << MINORBITS) @@ -793,7 +791,7 @@ static int __init 
genhd_device_init(void { int error; - block_class.dev_kobj = sysfs_dev_block_kobj; + block_class.dev_kobj = ve_sysfs_dev_block_kobj; error = class_register(&block_class); if (unlikely(error)) return error; @@ -802,10 +800,10 @@ static int __init genhd_device_init(void register_blkdev(BLOCK_EXT_MAJOR, "blkext"); -#ifndef CONFIG_SYSFS_DEPRECATED - /* create top-level block dir */ - block_depr = kobject_create_and_add("block", NULL); -#endif + if (!sysfs_deprecated) + /* create top-level block dir */ + block_depr = kobject_create_and_add("block", NULL); + return 0; } @@ -997,6 +995,7 @@ static void disk_release(struct device * struct class block_class = { .name = "block", }; +EXPORT_SYMBOL(block_class); static char *block_devnode(struct device *dev, mode_t *mode) { diff -urNp linux-2.6.32.48/block/Kconfig linux-2.6.32.48-openvz/block/Kconfig --- linux-2.6.32.48/block/Kconfig 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/block/Kconfig 2011-11-21 17:40:45.000000000 -0500 @@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_CGROUP + bool + depends on CGROUPS + default n + ---help--- + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. + + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. + +config DEBUG_BLK_CGROUP + bool + depends on BLK_CGROUP + default n + ---help--- + Enable some debugging help. Currently it stores the cgroup path + in the blk group which can be used by cfq for tracing various + group related activity. 
+ endif # BLOCK config BLOCK_COMPAT diff -urNp linux-2.6.32.48/block/Kconfig.iosched linux-2.6.32.48-openvz/block/Kconfig.iosched --- linux-2.6.32.48/block/Kconfig.iosched 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/block/Kconfig.iosched 2011-11-21 17:40:45.000000000 -0500 @@ -40,6 +40,23 @@ config IOSCHED_CFQ working environment, suitable for desktop systems. This is the default I/O scheduler. +config CFQ_GROUP_IOSCHED + bool "CFQ Group Scheduling support" + depends on IOSCHED_CFQ && CGROUPS + select BLK_CGROUP + default n + ---help--- + Enable group IO scheduling in CFQ. + +config DEBUG_CFQ_IOSCHED + bool "Debug CFQ Scheduling" + depends on CFQ_GROUP_IOSCHED + select DEBUG_BLK_CGROUP + default n + ---help--- + Enable CFQ IO scheduling debugging in CFQ. Currently it makes + blktrace output more verbose. + choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff -urNp linux-2.6.32.48/block/Makefile linux-2.6.32.48-openvz/block/Makefile --- linux-2.6.32.48/block/Makefile 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/block/Makefile 2011-11-21 17:40:45.000000000 -0500 @@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-co blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o diff -urNp linux-2.6.32.48/COPYING.Parallels linux-2.6.32.48-openvz/COPYING.Parallels --- linux-2.6.32.48/COPYING.Parallels 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/COPYING.Parallels 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,350 @@ + +Nothing in this license should be construed as a grant by Parallels of any rights +beyond the rights specified in the GNU General Public License, and nothing in +this license should be construed as a waiver by Parallels of its patent, copyright +and/or trademark rights, beyond the waiver required 
by the GNU General Public +License. This license is expressly inapplicable to any product that is not +within the scope of the GNU General Public License + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. 
And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff -urNp linux-2.6.32.48/drivers/base/base.h linux-2.6.32.48-openvz/drivers/base/base.h --- linux-2.6.32.48/drivers/base/base.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/base/base.h 2011-11-21 17:40:45.000000000 -0500 @@ -129,7 +129,12 @@ extern char *make_class_name(const char extern int devres_release_all(struct device *dev); +#ifndef CONFIG_VE extern struct kset *devices_kset; +#define ve_devices_kset devices_kset +#else +#define ve_devices_kset (get_exec_env()->devices_kset) +#endif #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS) extern void module_add_driver(struct module *mod, struct device_driver *drv); diff -urNp linux-2.6.32.48/drivers/base/bus.c linux-2.6.32.48-openvz/drivers/base/bus.c --- linux-2.6.32.48/drivers/base/bus.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/base/bus.c 2011-11-21 17:40:45.000000000 -0500 @@ -439,21 +439,20 @@ static void device_remove_attrs(struct b } } -#ifdef CONFIG_SYSFS_DEPRECATED static int make_deprecated_bus_links(struct device *dev) { - return sysfs_create_link(&dev->kobj, - &dev->bus->p->subsys.kobj, "bus"); + if (sysfs_deprecated) + return sysfs_create_link(&dev->kobj, + &dev->bus->p->subsys.kobj, "bus"); + else + return 0; } static void remove_deprecated_bus_links(struct device *dev) { - sysfs_remove_link(&dev->kobj, "bus"); + if (sysfs_deprecated) + sysfs_remove_link(&dev->kobj, "bus"); } -#else -static inline int make_deprecated_bus_links(struct device *dev) { return 0; } -static inline void remove_deprecated_bus_links(struct device *dev) { } -#endif /** * bus_add_device - add device to bus diff -urNp linux-2.6.32.48/drivers/base/class.c linux-2.6.32.48-openvz/drivers/base/class.c --- linux-2.6.32.48/drivers/base/class.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/base/class.c 2011-11-21 17:40:45.000000000 -0500 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include "base.h" #define to_class_attr(_attr) 
container_of(_attr, struct class_attribute, attr) @@ -74,8 +76,14 @@ static struct kobj_type class_ktype = { }; /* Hotplug events for classes go to the class class_subsys */ -static struct kset *class_kset; +#ifndef CONFIG_VE +struct kset *class_kset; +EXPORT_SYMBOL_GPL(class_kset); +#define visible_class_kset class_kset +#else +#define visible_class_kset (get_exec_env()->class_kset) +#endif int class_create_file(struct class *cls, const struct class_attribute *attr) { @@ -173,14 +181,14 @@ int __class_register(struct class *cls, /* set the default /sys/dev directory for devices of this class */ if (!cls->dev_kobj) - cls->dev_kobj = sysfs_dev_char_kobj; + cls->dev_kobj = ve_sysfs_dev_char_kobj; -#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK) +#if defined(CONFIG_BLOCK) /* let the block class directory show up in the root of sysfs */ - if (cls != &block_class) - cp->class_subsys.kobj.kset = class_kset; + if (!sysfs_deprecated || cls != &block_class) + cp->class_subsys.kobj.kset = visible_class_kset; #else - cp->class_subsys.kobj.kset = class_kset; + cp->class_subsys.kobj.kset = visible_class_kset; #endif cp->class_subsys.kobj.ktype = &class_ktype; cp->class = cls; @@ -265,7 +273,6 @@ void class_destroy(struct class *cls) class_unregister(cls); } -#ifdef CONFIG_SYSFS_DEPRECATED char *make_class_name(const char *name, struct kobject *kobj) { char *class_name; @@ -282,7 +289,6 @@ char *make_class_name(const char *name, strcat(class_name, kobject_name(kobj)); return class_name; } -#endif /** * class_dev_iter_init - initialize class device iterator @@ -508,7 +514,7 @@ struct class_compat *class_compat_regist cls = kmalloc(sizeof(struct class_compat), GFP_KERNEL); if (!cls) return NULL; - cls->kobj = kobject_create_and_add(name, &class_kset->kobj); + cls->kobj = kobject_create_and_add(name, &visible_class_kset->kobj); if (!cls->kobj) { kfree(cls); return NULL; @@ -577,13 +583,20 @@ void class_compat_remove_link(struct cla } 
EXPORT_SYMBOL_GPL(class_compat_remove_link); -int __init classes_init(void) +int classes_init(void) { - class_kset = kset_create_and_add("class", NULL, NULL); - if (!class_kset) + visible_class_kset = kset_create_and_add("class", NULL, NULL); + if (!visible_class_kset) return -ENOMEM; return 0; } +EXPORT_SYMBOL_GPL(classes_init); + +void classes_fini(void) +{ + kset_unregister(visible_class_kset); +} +EXPORT_SYMBOL_GPL(classes_fini); EXPORT_SYMBOL_GPL(class_create_file); EXPORT_SYMBOL_GPL(class_remove_file); diff -urNp linux-2.6.32.48/drivers/base/core.c linux-2.6.32.48-openvz/drivers/base/core.c --- linux-2.6.32.48/drivers/base/core.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/base/core.c 2011-11-21 17:40:45.000000000 -0500 @@ -23,15 +23,22 @@ #include #include #include +#include +#include #include "base.h" #include "power/power.h" int (*platform_notify)(struct device *dev) = NULL; int (*platform_notify_remove)(struct device *dev) = NULL; +#ifndef CONFIG_VE static struct kobject *dev_kobj; +#define ve_dev_kobj dev_kobj struct kobject *sysfs_dev_char_kobj; struct kobject *sysfs_dev_block_kobj; +#else +#define ve_dev_kobj (get_exec_env()->dev_kobj) +#endif #ifdef CONFIG_BLOCK static inline int device_is_not_partition(struct device *dev) @@ -192,7 +199,9 @@ static int dev_uevent(struct kset *kset, if (dev->driver) add_uevent_var(env, "DRIVER=%s", dev->driver->name); -#ifdef CONFIG_SYSFS_DEPRECATED + if (!sysfs_deprecated) + goto skip; + if (dev->class) { struct device *parent = dev->parent; @@ -221,7 +230,7 @@ static int dev_uevent(struct kset *kset, add_uevent_var(env, "PHYSDEVDRIVER=%s", dev->driver->name); } -#endif +skip: /* have the bus specific function add its stuff */ if (dev->bus && dev->bus->uevent) { @@ -438,8 +447,9 @@ static ssize_t show_dev(struct device *d static struct device_attribute devt_attr = __ATTR(dev, S_IRUGO, show_dev, NULL); -/* kset to create /sys/devices/ */ +#ifndef CONFIG_VE struct kset *devices_kset; +#endif 
/** * device_create_file - create sysfs attribute file for device. @@ -557,7 +567,7 @@ static void klist_children_put(struct kl */ void device_initialize(struct device *dev) { - dev->kobj.kset = devices_kset; + dev->kobj.kset = ve_devices_kset; kobject_init(&dev->kobj, &device_ktype); INIT_LIST_HEAD(&dev->dma_pools); init_MUTEX(&dev->sem); @@ -568,8 +578,7 @@ void device_initialize(struct device *de set_dev_node(dev, -1); } -#ifdef CONFIG_SYSFS_DEPRECATED -static struct kobject *get_device_parent(struct device *dev, +static struct kobject *get_device_parent_dep(struct device *dev, struct device *parent) { /* class devices without a parent live in /sys/class/<classname>/ */ @@ -582,22 +591,25 @@ static struct kobject *get_device_parent return NULL; } -static inline void cleanup_device_parent(struct device *dev) {} -static inline void cleanup_glue_dir(struct device *dev, +static inline void cleanup_device_parent_dep(struct device *dev) {} +static inline void cleanup_glue_dir_dep(struct device *dev, struct kobject *glue_dir) {} +#ifndef CONFIG_VE +static struct kobject *virtual_dir = NULL; #else +# define virtual_dir (get_exec_env()->_virtual_dir) +#endif + static struct kobject *virtual_device_parent(struct device *dev) { - static struct kobject *virtual_dir = NULL; - if (!virtual_dir) virtual_dir = kobject_create_and_add("virtual", - &devices_kset->kobj); + &ve_devices_kset->kobj); return virtual_dir; } -static struct kobject *get_device_parent(struct device *dev, +static struct kobject *get_device_parent_nodep(struct device *dev, struct device *parent) { int retval; @@ -658,7 +670,7 @@ static struct kobject *get_device_parent return NULL; } -static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir) +static void cleanup_glue_dir_nodep(struct device *dev, struct kobject *glue_dir) { /* see if we live in a "glue" directory */ if (!glue_dir || !dev->class || @@ -668,11 +680,36 @@ static void cleanup_glue_dir(struct devi kobject_put(glue_dir); } +static void 
cleanup_device_parent_nodep(struct device *dev) +{ + cleanup_glue_dir_nodep(dev, dev->kobj.parent); +} + +static struct kobject *get_device_parent(struct device *dev, + struct device *parent) +{ + if (sysfs_deprecated) + return get_device_parent_dep(dev, parent); + else + return get_device_parent_nodep(dev, parent); +} + +static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir) +{ + if (sysfs_deprecated) + cleanup_glue_dir_dep(dev, glue_dir); + else + cleanup_glue_dir_nodep(dev, glue_dir); +} + static void cleanup_device_parent(struct device *dev) { - cleanup_glue_dir(dev, dev->kobj.parent); + if (sysfs_deprecated) + cleanup_device_parent_dep(dev); + else + cleanup_device_parent_nodep(dev); } -#endif + static void setup_parent(struct device *dev, struct device *parent) { @@ -695,7 +732,9 @@ static int device_add_class_symlinks(str if (error) goto out; -#ifdef CONFIG_SYSFS_DEPRECATED + if (!sysfs_deprecated) + goto nodep; + /* stacked class devices need a symlink in the class directory */ if (dev->kobj.parent != &dev->class->p->class_subsys.kobj && device_is_not_partition(dev)) { @@ -720,7 +759,7 @@ static int device_add_class_symlinks(str &parent->kobj, "device"); if (error) - goto out_busid; + goto out_busid_dep; class_name = make_class_name(dev->class->name, &dev->kobj); @@ -736,12 +775,14 @@ static int device_add_class_symlinks(str out_device: if (dev->parent && device_is_not_partition(dev)) sysfs_remove_link(&dev->kobj, "device"); -out_busid: +out_busid_dep: if (dev->kobj.parent != &dev->class->p->class_subsys.kobj && device_is_not_partition(dev)) sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev)); -#else + goto out_subsys; + +nodep: /* link in the class directory pointing to the device */ error = sysfs_create_link(&dev->class->p->class_subsys.kobj, &dev->kobj, dev_name(dev)); @@ -752,14 +793,12 @@ out_busid: error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, "device"); if (error) - goto out_busid; + goto 
out_busid_nodep; } return 0; -out_busid: +out_busid_nodep: sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev)); -#endif - out_subsys: sysfs_remove_link(&dev->kobj, "subsystem"); out: @@ -771,7 +810,9 @@ static void device_remove_class_symlinks if (!dev->class) return; -#ifdef CONFIG_SYSFS_DEPRECATED + if (!sysfs_deprecated) + goto nodep; + if (dev->parent && device_is_not_partition(dev)) { char *class_name; @@ -787,13 +828,14 @@ static void device_remove_class_symlinks device_is_not_partition(dev)) sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev)); -#else + goto done; + +nodep: if (dev->parent && device_is_not_partition(dev)) sysfs_remove_link(&dev->kobj, "device"); sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev)); -#endif - +done: sysfs_remove_link(&dev->kobj, "subsystem"); } @@ -832,7 +874,7 @@ static struct kobject *device_to_dev_kob if (dev->class) kobj = dev->class->dev_kobj; else - kobj = sysfs_dev_char_kobj; + kobj = ve_sysfs_dev_char_kobj; return kobj; } @@ -1270,31 +1312,43 @@ struct device *device_find_child(struct return child; } -int __init devices_init(void) +int devices_init(void) { - devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL); - if (!devices_kset) - return -ENOMEM; - dev_kobj = kobject_create_and_add("dev", NULL); - if (!dev_kobj) + ve_devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL); + if (!ve_devices_kset) + goto dev_kset_err; + ve_dev_kobj = kobject_create_and_add("dev", NULL); + if (!ve_dev_kobj) goto dev_kobj_err; - sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj); - if (!sysfs_dev_block_kobj) + ve_sysfs_dev_block_kobj = kobject_create_and_add("block", ve_dev_kobj); + if (!ve_sysfs_dev_block_kobj) goto block_kobj_err; - sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj); - if (!sysfs_dev_char_kobj) + ve_sysfs_dev_char_kobj = kobject_create_and_add("char", ve_dev_kobj); + if (!ve_sysfs_dev_char_kobj) goto 
char_kobj_err; return 0; char_kobj_err: - kobject_put(sysfs_dev_block_kobj); + kobject_put(ve_sysfs_dev_block_kobj); block_kobj_err: - kobject_put(dev_kobj); + kobject_put(ve_dev_kobj); dev_kobj_err: - kset_unregister(devices_kset); + kset_unregister(ve_devices_kset); +dev_kset_err: return -ENOMEM; } +EXPORT_SYMBOL_GPL(devices_init); + +void devices_fini(void) +{ + kobject_put(ve_sysfs_dev_char_kobj); + kobject_put(ve_sysfs_dev_block_kobj); + kobject_put(ve_dev_kobj); + kset_unregister(ve_devices_kset); +} +EXPORT_SYMBOL_GPL(devices_fini); + EXPORT_SYMBOL_GPL(device_for_each_child); EXPORT_SYMBOL_GPL(device_find_child); @@ -1556,10 +1610,8 @@ int device_rename(struct device *dev, ch pr_debug("device: '%s': %s: renaming to '%s'\n", dev_name(dev), __func__, new_name); -#ifdef CONFIG_SYSFS_DEPRECATED - if ((dev->class) && (dev->parent)) + if (sysfs_deprecated && (dev->class) && (dev->parent)) old_class_name = make_class_name(dev->class->name, &dev->kobj); -#endif old_device_name = kstrdup(dev_name(dev), GFP_KERNEL); if (!old_device_name) { @@ -1571,8 +1623,7 @@ int device_rename(struct device *dev, ch if (error) goto out; -#ifdef CONFIG_SYSFS_DEPRECATED - if (old_class_name) { + if (sysfs_deprecated && old_class_name) { new_class_name = make_class_name(dev->class->name, &dev->kobj); if (new_class_name) { error = sysfs_create_link_nowarn(&dev->parent->kobj, @@ -1583,8 +1634,7 @@ int device_rename(struct device *dev, ch sysfs_remove_link(&dev->parent->kobj, old_class_name); } } -#else - if (dev->class) { + if (!sysfs_deprecated && dev->class) { error = sysfs_create_link_nowarn(&dev->class->p->class_subsys.kobj, &dev->kobj, dev_name(dev)); if (error) @@ -1592,7 +1642,6 @@ int device_rename(struct device *dev, ch sysfs_remove_link(&dev->class->p->class_subsys.kobj, old_device_name); } -#endif out: put_device(dev); @@ -1610,9 +1659,11 @@ static int device_move_class_links(struc struct device *new_parent) { int error = 0; -#ifdef CONFIG_SYSFS_DEPRECATED char *class_name; + 
if (!sysfs_deprecated) + goto nodep; + class_name = make_class_name(dev->class->name, &dev->kobj); if (!class_name) { error = -ENOMEM; @@ -1636,14 +1687,14 @@ static int device_move_class_links(struc out: kfree(class_name); return error; -#else + +nodep: if (old_parent) sysfs_remove_link(&dev->kobj, "device"); if (new_parent) error = sysfs_create_link(&dev->kobj, &new_parent->kobj, "device"); return error; -#endif } /** @@ -1734,7 +1785,12 @@ void device_shutdown(void) { struct device *dev, *devn; - list_for_each_entry_safe_reverse(dev, devn, &devices_kset->list, + if (!ve_is_super(get_exec_env())) { + printk("BUG: device_shutdown call from inside VE\n"); + return; + } + + list_for_each_entry_safe_reverse(dev, devn, &ve_devices_kset->list, kobj.entry) { if (dev->bus && dev->bus->shutdown) { dev_dbg(dev, "shutdown\n"); @@ -1744,8 +1800,9 @@ void device_shutdown(void) dev->driver->shutdown(dev); } } - kobject_put(sysfs_dev_char_kobj); - kobject_put(sysfs_dev_block_kobj); - kobject_put(dev_kobj); + + kobject_put(ve_sysfs_dev_char_kobj); + kobject_put(ve_sysfs_dev_block_kobj); + kobject_put(ve_dev_kobj); async_synchronize_full(); } diff -urNp linux-2.6.32.48/drivers/base/Kconfig linux-2.6.32.48-openvz/drivers/base/Kconfig --- linux-2.6.32.48/drivers/base/Kconfig 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/base/Kconfig 2011-11-21 17:40:45.000000000 -0500 @@ -10,7 +10,7 @@ config UEVENT_HELPER_PATH config DEVTMPFS bool "Create a kernel maintained /dev tmpfs (EXPERIMENTAL)" - depends on HOTPLUG && SHMEM && TMPFS + depends on HOTPLUG && SHMEM && TMPFS && !VE help This creates a tmpfs filesystem, and mounts it at bootup and mounts it at /dev. 
The kernel driver core creates device diff -urNp linux-2.6.32.48/drivers/base/sys.c linux-2.6.32.48-openvz/drivers/base/sys.c --- linux-2.6.32.48/drivers/base/sys.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/base/sys.c 2011-11-21 17:40:45.000000000 -0500 @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include #include @@ -494,7 +496,7 @@ EXPORT_SYMBOL_GPL(sysdev_resume); int __init system_bus_init(void) { - system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); + system_kset = kset_create_and_add("system", NULL, &ve_devices_kset->kobj); if (!system_kset) return -ENOMEM; return 0; diff -urNp linux-2.6.32.48/drivers/char/Kconfig linux-2.6.32.48-openvz/drivers/char/Kconfig --- linux-2.6.32.48/drivers/char/Kconfig 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/char/Kconfig 2011-11-21 17:40:45.000000000 -0500 @@ -458,7 +458,7 @@ config UNIX98_PTYS config DEVPTS_MULTIPLE_INSTANCES bool "Support multiple instances of devpts" - depends on UNIX98_PTYS + depends on UNIX98_PTYS && !VE default n ---help--- Enable support for multiple instances of devpts filesystem. 
diff -urNp linux-2.6.32.48/drivers/char/keyboard.c linux-2.6.32.48-openvz/drivers/char/keyboard.c --- linux-2.6.32.48/drivers/char/keyboard.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/char/keyboard.c 2011-11-21 17:40:45.000000000 -0500 @@ -162,6 +162,7 @@ unsigned char kbd_sysrq_xlate[KEY_MAX + static int sysrq_down; static int sysrq_alt_use; #endif +int sysrq_key_scancode = KEY_SYSRQ; static int sysrq_alt; /* @@ -1067,6 +1068,9 @@ static int emulate_raw(struct vc_data *v { int code; + if (keycode == sysrq_key_scancode && sysrq_alt) + goto sysrq; + switch (keycode) { case KEY_PAUSE: put_queue(vc, 0xe1); @@ -1085,6 +1089,7 @@ static int emulate_raw(struct vc_data *v break; case KEY_SYSRQ: +sysrq: /* * Real AT keyboards (that's what we're trying * to emulate here emit 0xe0 0x2a 0xe0 0x37 when @@ -1179,7 +1184,8 @@ static void kbd_keycode(unsigned int key printk(KERN_WARNING "keyboard.c: can't emulate rawmode for keycode %d\n", keycode); #ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */ - if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) { + if ((keycode == sysrq_key_scancode || keycode == KEY_SYSRQ) && + (sysrq_down || (down == 1 && sysrq_alt))) { if (!sysrq_down) { sysrq_down = down; sysrq_alt_use = sysrq_alt; diff -urNp linux-2.6.32.48/drivers/char/pty.c linux-2.6.32.48-openvz/drivers/char/pty.c --- linux-2.6.32.48/drivers/char/pty.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/char/pty.c 2011-11-21 17:40:45.000000000 -0500 @@ -30,16 +30,22 @@ #include #include +#include + #include #ifdef CONFIG_UNIX98_PTYS -static struct tty_driver *ptm_driver; -static struct tty_driver *pts_driver; +struct tty_driver *ptm_driver; +struct tty_driver *pts_driver; +EXPORT_SYMBOL(ptm_driver); +EXPORT_SYMBOL(pts_driver); #endif static void pty_close(struct tty_struct *tty, struct file *filp) { BUG_ON(!tty); + + ub_pty_uncharge(tty); if (tty->driver->subtype == PTY_TYPE_MASTER) WARN_ON(tty->count > 1); else 
{ @@ -58,8 +64,12 @@ static void pty_close(struct tty_struct if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); #ifdef CONFIG_UNIX98_PTYS - if (tty->driver == ptm_driver) + if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) { + struct ve_struct *old_env; + old_env = set_exec_env(tty->owner_env); devpts_pty_kill(tty->link); + (void)set_exec_env(old_env); + } #endif tty_vhangup(tty->link); } @@ -201,6 +211,10 @@ static int pty_open(struct tty_struct *t if (tty->link->count != 1) goto out; + retval = -ENOMEM; + if (ub_pty_charge(tty)) + goto out; + clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); retval = 0; @@ -358,9 +372,12 @@ static const struct tty_operations slave .resize = pty_resize }; +struct tty_driver *pty_driver, *pty_slave_driver; +EXPORT_SYMBOL(pty_driver); +EXPORT_SYMBOL(pty_slave_driver); + static void __init legacy_pty_init(void) { - struct tty_driver *pty_driver, *pty_slave_driver; if (legacy_count <= 0) return; @@ -645,7 +662,7 @@ static int __ptmx_open(struct inode *ino return index; mutex_lock(&tty_mutex); - tty = tty_init_dev(ptm_driver, index, 1); + tty = tty_init_dev(get_exec_env()->ptm_driver, index, NULL, 1); mutex_unlock(&tty_mutex); if (IS_ERR(tty)) { @@ -661,7 +678,7 @@ static int __ptmx_open(struct inode *ino if (retval) goto out1; - retval = ptm_driver->ops->open(tty, filp); + retval = get_exec_env()->ptm_driver->ops->open(tty, filp); if (!retval) return 0; out1: @@ -744,6 +761,9 @@ static void __init unix98_pty_init(void) register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0) panic("Couldn't register /dev/ptmx driver\n"); device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); +#ifdef CONFIG_VE + get_ve0()->ptm_driver = ptm_driver; +#endif } #else diff -urNp linux-2.6.32.48/drivers/char/sysrq.c linux-2.6.32.48-openvz/drivers/char/sysrq.c --- linux-2.6.32.48/drivers/char/sysrq.c 2011-11-08 19:02:43.000000000 -0500 +++ 
linux-2.6.32.48-openvz/drivers/char/sysrq.c 2011-11-21 17:40:45.000000000 -0500 @@ -37,7 +37,10 @@ #include #include #include +#include +#include #include +#include #include #include @@ -250,8 +253,8 @@ static struct sysrq_key_op sysrq_showall static void sysrq_handle_showregs(int key, struct tty_struct *tty) { struct pt_regs *regs = get_irq_regs(); - if (regs) - show_regs(regs); + + nmi_show_regs(regs, 0); perf_event_print_debug(); } static struct sysrq_key_op sysrq_showregs_op = { @@ -303,6 +306,7 @@ static struct sysrq_key_op sysrq_ftrace_ static void sysrq_handle_showmem(int key, struct tty_struct *tty) { show_mem(); + show_slab_info(); } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, @@ -318,7 +322,7 @@ static void send_sig_all(int sig) { struct task_struct *p; - for_each_process(p) { + for_each_process_all(p) { if (p->mm && !is_global_init(p)) /* Not swapper, init nor kernel thread */ force_sig(sig, p); @@ -394,7 +398,267 @@ static struct sysrq_key_op sysrq_unrt_op /* Key Operations table and lock */ static DEFINE_SPINLOCK(sysrq_key_table_lock); -static struct sysrq_key_op *sysrq_key_table[36] = { +#define SYSRQ_KEY_TABLE_LENGTH 37 +static struct sysrq_key_op **sysrq_key_table; +static struct sysrq_key_op *sysrq_default_key_table[]; + +#ifdef CONFIG_SYSRQ_DEBUG +#define SYSRQ_NAMELEN_MAX 64 +#define SYSRQ_DUMP_LINES 32 + +static struct sysrq_key_op *sysrq_debug_key_table[]; +static struct sysrq_key_op *sysrq_input_key_table[]; +static unsigned long *dump_address; +static int orig_console_loglevel; +static void (*sysrq_input_return)(char *) = NULL; + +static void dump_mem(void) +{ + unsigned long value[4]; + mm_segment_t old_fs; + int line, err; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = 0; + + for (line = 0; line < SYSRQ_DUMP_LINES; line++) { + err |= __get_user(value[0], dump_address++); + err |= __get_user(value[1], dump_address++); + err |= __get_user(value[2], dump_address++); + err |= __get_user(value[3], 
dump_address++); + if (err) { + printk("Invalid address %p\n", dump_address - 4); + break; + } +#if BITS_PER_LONG == 32 + printk("0x%p: %08lx %08lx %08lx %08lx\n", + dump_address - 4, + value[0], value[1], value[2], value[3]); +#else + printk("0x%p: %016lx %016lx %016lx %016lx\n", + dump_address - 4, + value[0], value[1], value[2], value[3]); +#endif + } + set_fs(old_fs); +} + +static void write_mem(unsigned long val) +{ + mm_segment_t old_fs; + unsigned long old_val; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (__get_user(old_val, dump_address)) { + printk("Invalid address %p\n", dump_address); + goto out; + } + +#if BITS_PER_LONG == 32 + printk("Changing [%p] from %08lx to %08lx\n", + dump_address, old_val, val); +#else + printk("Changing [%p] from %016lx to %016lx\n", + dump_address, old_val, val); +#endif + __put_user(val, dump_address); +out: + set_fs(old_fs); +} + +static void handle_read(int key, struct tty_struct *tty) +{ + static int pos; + static int upper_case; + static char str[SYSRQ_NAMELEN_MAX]; + + if (key == 0) { + /* actually 0 is not shift only... */ + upper_case = 1; + return; + } + + if (key == 0x0d || pos == SYSRQ_NAMELEN_MAX - 1) { + /* enter */ + sysrq_key_table = sysrq_debug_key_table; + str[pos] = '\0'; + pos = upper_case = 0; + printk("\n"); + if (sysrq_input_return == NULL) + printk("No return handler!!!\n"); + else + sysrq_input_return(str); + return; + }; + + /* check for alowed symbols */ + if (key == '-') { + if (upper_case) + key = '_'; + goto correct; + }; + if (key >= 'a' && key <= 'z') { + if (upper_case) + key = key - 'a' + 'A'; + goto correct; + }; + if (key >= '0' && key <= '9') + goto correct; + + upper_case = 0; + return; + +correct: + str[pos] = key; + printk("%c", (char)key); + pos++; + upper_case = 0; +} + +static struct sysrq_key_op input_read = { + .handler = handle_read, + .help_msg = "", + .action_msg = NULL, +}; + +static struct sysrq_key_op *sysrq_input_key_table[SYSRQ_KEY_TABLE_LENGTH] = { + [0 ... 
SYSRQ_KEY_TABLE_LENGTH - 1] = &input_read, +}; + +static void return_dump_mem(char *str) +{ + unsigned long address; + char *end; + + address = simple_strtoul(str, &end, 0); + if (*end != '\0') { + printk("Bad address [%s]\n", str); + return; + } + + dump_address = (unsigned long *)address; + dump_mem(); +} + +static void handle_dump_mem(int key, struct tty_struct *tty) +{ + sysrq_input_return = return_dump_mem; + sysrq_key_table = sysrq_input_key_table; +} + +static struct sysrq_key_op debug_dump_mem = { + .handler = handle_dump_mem, + .help_msg = "Dump", + .action_msg = "Enter address:", +}; + +static void return_resolve(char *str) +{ + unsigned long address; + + address = kallsyms_lookup_name(str); + printk("%s : %lx\n", str, address); + if (address) { + dump_address = (unsigned long *)address; + printk("Now you can dump it via X\n"); + } +} + +static void handle_resolve(int key, struct tty_struct *tty) +{ + sysrq_input_return = return_resolve; + sysrq_key_table = sysrq_input_key_table; +} + +static struct sysrq_key_op debug_resolve = { + .handler = handle_resolve, + .help_msg = "Resolve", + .action_msg = "Enter symbol name:", +}; + +static void return_write_mem(char *str) +{ + unsigned long address; + unsigned long value; + char *end; + + address = simple_strtoul(str, &end, 0); + if (*end != '-') { + printk("Bad address in %s\n", str); + return; + } + value = simple_strtoul(end + 1, &end, 0); + if (*end != '\0') { + printk("Bad value in %s\n", str); + return; + } + + dump_address = (unsigned long *)address; + write_mem(value); +} + +static void handle_write_mem(int key, struct tty_struct *tty) +{ + sysrq_input_return = return_write_mem; + sysrq_key_table = sysrq_input_key_table; +} + +static struct sysrq_key_op debug_write_mem = { + .handler = handle_write_mem, + .help_msg = "Writemem", + .action_msg = "Enter address-value:", +}; + +static void handle_next(int key, struct tty_struct *tty) +{ + dump_mem(); +} + +static struct sysrq_key_op debug_next = { + 
.handler = handle_next, + .help_msg = "neXt", + .action_msg = "continuing", +}; + +static void handle_quit(int key, struct tty_struct *tty) +{ + sysrq_key_table = sysrq_default_key_table; + console_loglevel = orig_console_loglevel; +} + +static struct sysrq_key_op debug_quit = { + .handler = handle_quit, + .help_msg = "Quit", + .action_msg = "Thank you for using debugger", +}; + +static struct sysrq_key_op *sysrq_debug_key_table[SYSRQ_KEY_TABLE_LENGTH] = { + [13] = &debug_dump_mem, /* d */ + [26] = &debug_quit, /* q */ + [27] = &debug_resolve, /* r */ + [32] = &debug_write_mem, /* w */ + [33] = &debug_next, /* x */ +}; + +static void sysrq_handle_debug(int key, struct tty_struct *tty) +{ + orig_console_loglevel = console_loglevel; + console_loglevel = 8; + sysrq_key_table = sysrq_debug_key_table; + printk("Welcome sysrq debugging mode\n" + "Press H for help\n"); +} + +static struct sysrq_key_op sysrq_debug_op = { + .handler = sysrq_handle_debug, + .help_msg = "debuG", + .action_msg = "Select desired action", +}; +#endif + +static struct sysrq_key_op *sysrq_default_key_table[SYSRQ_KEY_TABLE_LENGTH] = { &sysrq_loglevel_op, /* 0 */ &sysrq_loglevel_op, /* 1 */ &sysrq_loglevel_op, /* 2 */ @@ -417,7 +681,11 @@ static struct sysrq_key_op *sysrq_key_ta &sysrq_term_op, /* e */ &sysrq_moom_op, /* f */ /* g: May be registered for the kernel debugger */ +#ifdef CONFIG_SYSRQ_DEBUG + &sysrq_debug_op, /* g */ +#else NULL, /* g */ +#endif NULL, /* h - reserved for help */ &sysrq_kill_op, /* i */ #ifdef CONFIG_BLOCK @@ -449,8 +717,11 @@ static struct sysrq_key_op *sysrq_key_ta /* y: May be registered on sparc64 for global register dump */ NULL, /* y */ &sysrq_ftrace_dump_op, /* z */ + NULL, /* for debugger */ }; +static struct sysrq_key_op **sysrq_key_table = sysrq_default_key_table; + /* key2index calculation, -1 on invalid index */ static int sysrq_key_table_key2index(int key) { @@ -460,6 +731,10 @@ static int sysrq_key_table_key2index(int retval = key - '0'; else if ((key >= 
'a') && (key <= 'z')) retval = key + 10 - 'a'; +#ifdef CONFIG_SYSRQ_DEBUG + else if (key == 0 || key == 0x0d || key == '-') + retval = SYSRQ_KEY_TABLE_LENGTH - 1; +#endif else retval = -1; return retval; @@ -470,21 +745,21 @@ static int sysrq_key_table_key2index(int */ struct sysrq_key_op *__sysrq_get_key_op(int key) { - struct sysrq_key_op *op_p = NULL; - int i; + struct sysrq_key_op *op_p = NULL; + int i; i = sysrq_key_table_key2index(key); if (i != -1) - op_p = sysrq_key_table[i]; - return op_p; + op_p = sysrq_key_table[i]; + return op_p; } static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) { - int i = sysrq_key_table_key2index(key); + int i = sysrq_key_table_key2index(key); - if (i != -1) - sysrq_key_table[i] = op_p; + if (i != -1) + sysrq_key_table[i] = op_p; } /* @@ -507,25 +782,25 @@ void __handle_sysrq(int key, struct tty_ */ orig_log_level = console_loglevel; console_loglevel = 7; - printk(KERN_INFO "SysRq : "); - op_p = __sysrq_get_key_op(key); - if (op_p) { + op_p = __sysrq_get_key_op(key); + if (op_p) { /* * Should we check for enabled operations (/proc/sysrq-trigger * should not) and is the invoked operation enabled? 
*/ if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { - printk("%s\n", op_p->action_msg); + if (op_p->action_msg) + printk("%s\n", op_p->action_msg); console_loglevel = orig_log_level; op_p->handler(key, tty); } else { printk("This sysrq operation is disabled.\n"); } } else { - printk("HELP : "); + printk("SysRq HELP : "); /* Only print the help msg once per handler */ - for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) { + for (i = 0; i < SYSRQ_KEY_TABLE_LENGTH; i++) { if (sysrq_key_table[i]) { int j; @@ -555,7 +830,7 @@ void handle_sysrq(int key, struct tty_st EXPORT_SYMBOL(handle_sysrq); static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p, - struct sysrq_key_op *remove_op_p) + struct sysrq_key_op *remove_op_p) { int retval; @@ -591,12 +866,29 @@ EXPORT_SYMBOL(unregister_sysrq_key); static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { + struct ve_struct *cur = get_exec_env(); + static int pnum = 10; + if (count) { - char c; + int i, cnt; + char c[32]; - if (get_user(c, buf)) + cnt = min(count, sizeof(c)); + if (copy_from_user(c, buf, cnt)) return -EFAULT; - __handle_sysrq(c, NULL, 0); + + + for (i = 0; i < cnt && c[i] != '\n'; i++) { + if (!ve_is_super(cur)) { + if (!pnum) + continue; + printk("SysRq: CT#%u sent '%c' magic key.\n", + cur->veid, c[i]); + pnum--; + continue; + } + __handle_sysrq(c[i], NULL, 0); + } } return count; } @@ -607,7 +899,7 @@ static const struct file_operations proc static int __init sysrq_init(void) { - proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); + proc_create("sysrq-trigger", S_IWUSR, &glob_proc_root, &proc_sysrq_trigger_operations); return 0; } module_init(sysrq_init); diff -urNp linux-2.6.32.48/drivers/char/tty_io.c linux-2.6.32.48-openvz/drivers/char/tty_io.c --- linux-2.6.32.48/drivers/char/tty_io.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/char/tty_io.c 2011-11-21 17:40:45.000000000 -0500 @@ 
-96,6 +96,8 @@ #include #include #include +#include +#include #include #include @@ -106,6 +108,7 @@ #include #include +#include #undef TTY_DEBUG_HANGUP @@ -130,6 +133,7 @@ EXPORT_SYMBOL(tty_std_termios); into this file */ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ +EXPORT_SYMBOL(tty_drivers); /* Mutex to protect creating and releasing a tty. This is shared with vt.c for deeply disgusting hack reasons */ @@ -166,7 +170,7 @@ static void proc_set_tty(struct task_str struct tty_struct *alloc_tty_struct(void) { - return kzalloc(sizeof(struct tty_struct), GFP_KERNEL); + return kzalloc(sizeof(struct tty_struct), GFP_KERNEL_UBC); } /** @@ -274,9 +278,29 @@ static struct tty_driver *get_tty_driver if (device < base || device >= base + p->num) continue; *index = device - base; - return tty_driver_kref_get(p); +#ifdef CONFIG_VE + if (in_interrupt()) + goto found; + if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR +#ifdef CONFIG_UNIX98_PTYS + && (p->major<UNIX98_PTY_MASTER_MAJOR || + p->major>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) && + (p->major<UNIX98_PTY_SLAVE_MAJOR || + p->major>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) +#endif + ) + goto found; + if (ve_is_super(p->owner_env) && ve_is_super(get_exec_env())) + goto found; + if (!ve_accessible_strict(p->owner_env, get_exec_env())) + continue; +#endif + goto found; } return NULL; + +found: + return tty_driver_kref_get(p); } #ifdef CONFIG_CONSOLE_POLL @@ -1169,7 +1193,7 @@ int tty_init_termios(struct tty_struct * tp = tty->driver->termios[idx]; if (tp == NULL) { - tp = kzalloc(sizeof(struct ktermios[2]), GFP_KERNEL); + tp = kzalloc(sizeof(struct ktermios[2]), GFP_KERNEL_UBC); if (tp == NULL) return -ENOMEM; memcpy(tp, &tty->driver->init_termios, @@ -1297,7 +1321,7 @@ static int tty_reopen(struct tty_struct */ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx, - int first_ok) + struct tty_struct *i_tty, int first_ok) { struct tty_struct *tty; int retval; @@ -1707,7 +1731,7 @@ void tty_release_dev(struct file *filp) static int 
__tty_open(struct inode *inode, struct file *filp) { - struct tty_struct *tty = NULL; + struct tty_struct *tty = NULL, *c_tty = NULL; int noctty, retval; struct tty_driver *driver; int index; @@ -1731,6 +1755,7 @@ retry_open: } driver = tty_driver_kref_get(tty->driver); index = tty->index; + c_tty = tty; filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ /* FIXME: Should we take a driver reference ? */ @@ -1740,6 +1765,12 @@ retry_open: #ifdef CONFIG_VT if (device == MKDEV(TTY_MAJOR, 0)) { extern struct tty_driver *console_driver; +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + mutex_unlock(&tty_mutex); + return -ENODEV; + } +#endif driver = tty_driver_kref_get(console_driver); index = fg_console; noctty = 1; @@ -1748,6 +1779,12 @@ retry_open: #endif if (device == MKDEV(TTYAUX_MAJOR, 1)) { struct tty_driver *console_driver = console_device(&index); +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + mutex_unlock(&tty_mutex); + return -ENODEV; + } +#endif if (console_driver) { driver = tty_driver_kref_get(console_driver); if (driver) { @@ -1782,7 +1819,7 @@ got_driver: if (retval) tty = ERR_PTR(retval); } else - tty = tty_init_dev(driver, index, 0); + tty = tty_init_dev(driver, index, c_tty, 0); mutex_unlock(&tty_mutex); tty_driver_kref_put(driver); @@ -2078,6 +2115,8 @@ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!ve_is_super(get_exec_env())) + return -EACCES; if (file->f_op->write == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); @@ -2658,7 +2697,7 @@ void __do_SAK(struct tty_struct *tty) /* Now kill any processes that happen to have the * tty open. 
*/ - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): task_session(p)==tty->session\n", @@ -2690,7 +2729,7 @@ void __do_SAK(struct tty_struct *tty) spin_unlock(&p->files->file_lock); } task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); #endif } @@ -2757,6 +2796,7 @@ void initialize_tty_struct(struct tty_st tty->ops = driver->ops; tty->index = idx; tty_line_name(driver, idx, tty->name); + tty->owner_env = driver->owner_env; } /** @@ -2849,6 +2889,7 @@ struct tty_driver *alloc_tty_driver(int driver->magic = TTY_DRIVER_MAGIC; driver->num = lines; /* later we'll move allocation of tables here */ + driver->owner_env = get_ve(get_exec_env()); } return driver; } @@ -2883,6 +2924,7 @@ static void destruct_tty_driver(struct k kfree(p); cdev_del(&driver->cdev); } + put_ve(driver->owner_env); kfree(driver); } @@ -2957,6 +2999,7 @@ int tty_register_driver(struct tty_drive } mutex_lock(&tty_mutex); + driver->owner_env = get_exec_env(); list_add(&driver->tty_drivers, &tty_drivers); mutex_unlock(&tty_mutex); @@ -3130,3 +3173,43 @@ static int __init tty_init(void) return 0; } module_init(tty_init); + +#ifdef CONFIG_UNIX98_PTYS +int init_ve_tty_class(void) +{ + struct class * ve_tty_class; + struct device * ve_ptmx_dev_class; + + ve_tty_class = class_create(THIS_MODULE, "tty"); + if (IS_ERR(ve_tty_class)) + return -ENOMEM; + + ve_ptmx_dev_class = device_create(ve_tty_class, NULL, + MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); + if (IS_ERR(ve_ptmx_dev_class)) { + class_destroy(ve_tty_class); + return PTR_ERR(ve_ptmx_dev_class); + } + + get_exec_env()->tty_class = ve_tty_class; + return 0; +} + +void fini_ve_tty_class(void) +{ + struct class *ve_tty_class = get_exec_env()->tty_class; + + device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 2)); + class_destroy(ve_tty_class); +} +#else +int init_ve_tty_class(void) +{ + return 0; +} +void 
fini_ve_tty_class(void) +{ +} +#endif +EXPORT_SYMBOL(init_ve_tty_class); +EXPORT_SYMBOL(fini_ve_tty_class); diff -urNp linux-2.6.32.48/drivers/char/vc_screen.c linux-2.6.32.48-openvz/drivers/char/vc_screen.c --- linux-2.6.32.48/drivers/char/vc_screen.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/char/vc_screen.c 2011-11-21 17:40:45.000000000 -0500 @@ -35,6 +35,8 @@ #include #include #include +#include + #include #include @@ -481,16 +483,22 @@ static struct class *vc_class; void vcs_make_sysfs(int index) { + struct ve_struct *ve = set_exec_env(get_ve0()); + device_create(vc_class, NULL, MKDEV(VCS_MAJOR, index + 1), NULL, "vcs%u", index + 1); device_create(vc_class, NULL, MKDEV(VCS_MAJOR, index + 129), NULL, "vcsa%u", index + 1); + set_exec_env(ve); } void vcs_remove_sysfs(int index) { + struct ve_struct *ve = set_exec_env(get_ve0()); + device_destroy(vc_class, MKDEV(VCS_MAJOR, index + 1)); device_destroy(vc_class, MKDEV(VCS_MAJOR, index + 129)); + set_exec_env(ve); } int __init vcs_init(void) diff -urNp linux-2.6.32.48/drivers/net/loopback.c linux-2.6.32.48-openvz/drivers/net/loopback.c --- linux-2.6.32.48/drivers/net/loopback.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/loopback.c 2011-11-21 17:40:45.000000000 -0500 @@ -75,6 +75,12 @@ static netdev_tx_t loopback_xmit(struct struct pcpu_lstats *pcpu_lstats, *lb_stats; int len; +#ifdef CONFIG_VE + if (unlikely(get_exec_env()->disable_net)) { + kfree_skb(skb); + return 0; + } +#endif skb_orphan(skb); skb->protocol = eth_type_trans(skb, dev); @@ -153,10 +159,16 @@ static void loopback_dev_free(struct net free_netdev(dev); } +static void loopback_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context *ctx) +{ +} + static const struct net_device_ops loopback_ops = { .ndo_init = loopback_dev_init, .ndo_start_xmit= loopback_xmit, .ndo_get_stats = loopback_get_stats, + .ndo_cpt = loopback_cpt, }; /* @@ -177,7 +189,8 @@ static void 
loopback_setup(struct net_de | NETIF_F_NO_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX - | NETIF_F_NETNS_LOCAL; + | NETIF_F_NETNS_LOCAL + | NETIF_F_VIRTUAL; dev->ethtool_ops = &loopback_ethtool_ops; dev->header_ops = &eth_header_ops; dev->netdev_ops = &loopback_ops; diff -urNp linux-2.6.32.48/drivers/net/Makefile linux-2.6.32.48-openvz/drivers/net/Makefile --- linux-2.6.32.48/drivers/net/Makefile 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/Makefile 2011-11-21 17:40:45.000000000 -0500 @@ -41,6 +41,10 @@ ucc_geth_driver-objs := ucc_geth.o ucc_g obj-$(CONFIG_FSL_PQ_MDIO) += fsl_pq_mdio.o +obj-$(CONFIG_VE_NETDEV) += vznetdev.o +vznetdev-objs := open_vznet.o venet_core.o +obj-$(CONFIG_VE_ETHDEV) += vzethdev.o + # # link order important here # diff -urNp linux-2.6.32.48/drivers/net/open_vznet.c linux-2.6.32.48-openvz/drivers/net/open_vznet.c --- linux-2.6.32.48/drivers/net/open_vznet.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/open_vznet.c 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,289 @@
+/*
+ * open_vznet.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+/*
+ * Virtual Networking device used to change VE ownership on packets
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/*
+ * Drop every address entry of @ve and detach the VE from its veip structure.
+ * For a non-super VE the module reference is dropped as well.
+ */
+void veip_stop(struct ve_struct *ve)
+{
+	struct list_head *p, *tmp;
+
+	write_lock_irq(&veip_hash_lock);
+	if (ve->veip == NULL)
+		goto unlock;
+	list_for_each_safe(p, tmp, &ve->veip->ip_lh) {
+		struct ip_entry_struct *ptr;
+		ptr = list_entry(p, struct ip_entry_struct, ve_list);
+		ptr->active_env = NULL;
+		list_del(&ptr->ve_list);
+		list_del(&ptr->ip_hash);
+		kfree(ptr);
+	}
+	veip_put(ve->veip);
+	ve->veip = NULL;
+	if (!ve_is_super(ve))
+		module_put(THIS_MODULE);
+unlock:
+	write_unlock_irq(&veip_hash_lock);
+}
+
+/*
+ * Look up or create the veip structure for @ve and store it in ve->veip.
+ * A module reference is taken when a non-super VE gets its veip for the
+ * first time. Returns 0 on success or -ENOMEM.
+ */
+int veip_start(struct ve_struct *ve)
+{
+	int err, get;
+
+	err = 0;
+	write_lock_irq(&veip_hash_lock);
+	get = ve->veip == NULL;
+	ve->veip = veip_findcreate(ve->veid);
+	if (ve->veip == NULL)
+		err = -ENOMEM;
+	write_unlock_irq(&veip_hash_lock);
+	if (err == 0 && get && !ve_is_super(ve))
+		__module_get(THIS_MODULE);
+	return err;
+}
+
+/*
+ * Assign the address @addr to @ve. Returns 0 on success, -EADDRINUSE if the
+ * address is already hashed, or a negative errno if an allocation fails.
+ */
+int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *entry, *found;
+	int err;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	if (ve->veip == NULL) {
+		/* This can happen if we load venet AFTER ve was started */
+		err = veip_start(ve);
+		if (err < 0)
+			goto out;
+	}
+
+	write_lock_irq(&veip_hash_lock);
+	err = -EADDRINUSE;
+	found = venet_entry_lookup(addr);
+	if (found != NULL)
+		goto out_unlock;
+
+	entry->active_env = ve;
+	entry->addr = *addr;
+	ip_entry_hash(entry, ve->veip);
+
+	err = 0;
+	entry = NULL;
+out_unlock:
+	write_unlock_irq(&veip_hash_lock);
+out:
+	/* entry is NULL once it has been hashed; kfree(NULL) is a no-op */
+	kfree(entry);
+	return err;
+}
+
+/*
+ * Remove the address @addr, provided it currently belongs to the VE with id
+ * @veid. Returns 0 on success or -EADDRNOTAVAIL otherwise.
+ */
+int veip_entry_del(envid_t veid, struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *found;
+	int err;
+
+	err = -EADDRNOTAVAIL;
+	write_lock_irq(&veip_hash_lock);
+	found = venet_entry_lookup(addr);
+	if (found == NULL)
+		goto out;
+	if (found->active_env->veid != veid)
+		goto out;
+
+	err = 0;
+	found->active_env = NULL;
+
+	list_del(&found->ip_hash);
+	list_del(&found->ve_list);
+	kfree(found);
+out:
+	write_unlock_irq(&veip_hash_lock);
+	return err;
+}
+
+/*
+ * Fill @addr from the IPv4 or IPv6 header of @skb: the destination address
+ * if @dir is non-zero, the source address otherwise. Returns 0 on success
+ * or -EAFNOSUPPORT for any other protocol.
+ */
+static int skb_extract_addr(struct sk_buff *skb,
+		struct ve_addr_struct *addr, int dir)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		addr->family = AF_INET;
+		addr->key[0] = 0;
+		addr->key[1] = 0;
+		addr->key[2] = 0;
+		addr->key[3] = (dir ? ip_hdr(skb)->daddr : ip_hdr(skb)->saddr);
+		return 0;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case __constant_htons(ETH_P_IPV6):
+		addr->family = AF_INET6;
+		memcpy(&addr->key, dir ?
+				ipv6_hdr(skb)->daddr.s6_addr32 :
+				ipv6_hdr(skb)->saddr.s6_addr32,
+				sizeof(addr->key));
+		return 0;
+#endif
+	}
+
+	return -EAFNOSUPPORT;
+}
+
+/*
+ * Return the VE recorded for the source (@dir == 0) or destination
+ * (@dir != 0) address of @skb, or NULL if there is none.
+ */
+static struct ve_struct *venet_find_ve(struct sk_buff *skb, int dir)
+{
+	struct ip_entry_struct *entry;
+	struct ve_addr_struct addr;
+
+	if (skb_extract_addr(skb, &addr, dir) < 0)
+		return NULL;
+
+	entry = venet_entry_lookup(&addr);
+	if (entry == NULL)
+		return NULL;
+
+	return entry->active_env;
+}
+
+/*
+ * Re-assign skb->owner_env for a packet that crosses the host/VE boundary.
+ *
+ * A packet owned by a VE is handed to the host (VE0) if its source address
+ * belongs to a VE that passes ve_accessible_strict() against the owner; a
+ * packet owned by the host is handed to the VE of its destination address.
+ * Returns 0 on success, -ESRCH if no VE is found for the address and
+ * -EACCES if the source check fails.
+ */
+int venet_change_skb_owner(struct sk_buff *skb)
+{
+	struct ve_struct *ve, *ve_old;
+
+	ve_old = skb->owner_env;
+
+	read_lock(&veip_hash_lock);
+	if (!ve_is_super(ve_old)) {
+		/* from VE to host */
+		ve = venet_find_ve(skb, 0);
+		if (ve == NULL)
+			goto out_drop;
+		if (!ve_accessible_strict(ve, ve_old))
+			goto out_source;
+		skb->owner_env = get_ve0();
+	} else {
+		/* from host to VE */
+		ve = venet_find_ve(skb, 1);
+		if (ve == NULL)
+			goto out_drop;
+		skb->owner_env = ve;
+	}
+	read_unlock(&veip_hash_lock);
+
+	return 0;
+
+out_drop:
+	read_unlock(&veip_hash_lock);
+	return -ESRCH;
+
+out_source:
+	read_unlock(&veip_hash_lock);
+	if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) {
+		printk(KERN_WARNING "Dropped packet, source wrong "
+				"veid=%u src-IP=%u.%u.%u.%u "
+				"dst-IP=%u.%u.%u.%u\n",
+				skb->owner_env->veid,
+				NIPQUAD(ip_hdr(skb)->saddr),
+				NIPQUAD(ip_hdr(skb)->daddr));
+	}
+	return -EACCES;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file show callback: prints the "Version" header when @v is
+ * ip_entry_hash_table itself, otherwise one line for the entry whose
+ * ip_hash node @v points to.
+ */
+int veip_seq_show(struct seq_file *m, void *v)
+{
+	struct list_head *p;
+	struct ip_entry_struct *entry;
+	char s[40];
+
+	p = (struct list_head *)v;
+	if (p == ip_entry_hash_table) {
+		seq_puts(m, "Version: 2.5\n");
+		return 0;
+	}
+	entry = list_entry(p, struct ip_entry_struct, ip_hash);
+	veaddr_print(s, sizeof(s), &entry->addr);
+	seq_printf(m, "%39s %10u\n", s, 0);
+	return 0;
+}
+#endif
+
+/*
+ * Module exit helper: free every entry that is still in the address hash.
+ */
+__exit void veip_cleanup(void)
+{
+	int i;
+
+	write_lock_irq(&veip_hash_lock);
+	for (i = 0; i < VEIP_HASH_SZ; i++)
+		while (!list_empty(ip_entry_hash_table + i)) {
+			struct ip_entry_struct *entry;
+
+			entry = list_first_entry(ip_entry_hash_table + i,
+					struct ip_entry_struct, ip_hash);
+			list_del(&entry->ip_hash);
+			/* ip_entry_hash() links both lists: unlink both */
+			list_del(&entry->ve_list);
+			kfree(entry);
+		}
+	write_unlock_irq(&veip_hash_lock);
+}
+
+MODULE_AUTHOR("SWsoft ");
+MODULE_DESCRIPTION("Virtuozzo Virtual Network Device");
+MODULE_LICENSE("GPL v2");
diff -urNp linux-2.6.32.48/drivers/net/ppp_generic.c linux-2.6.32.48-openvz/drivers/net/ppp_generic.c --- linux-2.6.32.48/drivers/net/ppp_generic.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/ppp_generic.c 2011-11-21 17:40:45.000000000 -0500 @@ -53,6 +53,9 @@ #include #include +#include +#include + #define PPP_VERSION "2.4.2" /* @@ -366,8 +369,10 @@ static int ppp_open(struct inode *inode, /* * This could (should?) be enforced by the permissions on /dev/ppp.
*/ - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; + if (!net_generic(get_exec_env()->ve_netns, ppp_net_id)) /* no VE_FEATURE_PPP */ + return -EACCES; return 0; } @@ -867,6 +872,9 @@ static __net_init int ppp_init_net(struc struct ppp_net *pn; int err; + if (!(get_exec_env()->features & VE_FEATURE_PPP)) + return 0; + pn = kzalloc(sizeof(*pn), GFP_KERNEL); if (!pn) return -ENOMEM; @@ -893,6 +901,9 @@ static __net_exit void ppp_exit_net(stru struct ppp_net *pn; pn = net_generic(net, ppp_net_id); + if (!pn) /* no VE_FEATURE_PPP */ + return; + idr_destroy(&pn->units_idr); /* * if someone has cached our net then @@ -1053,7 +1064,7 @@ static void ppp_setup(struct net_device dev->tx_queue_len = 3; dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_VIRTUAL; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } @@ -2568,16 +2579,16 @@ ppp_create_interface(struct net *net, in */ dev_net_set(dev, net); - ret = -EEXIST; mutex_lock(&pn->all_ppp_mutex); if (unit < 0) { unit = unit_get(&pn->units_idr, ppp); if (unit < 0) { - *retp = unit; + ret = unit; goto out2; } } else { + ret = -EEXIST; if (unit_find(&pn->units_idr, unit)) goto out2; /* unit already exists */ /* @@ -2652,10 +2663,10 @@ static void ppp_shutdown_interface(struc ppp->closing = 1; ppp_unlock(ppp); unregister_netdev(ppp->dev); + unit_put(&pn->units_idr, ppp->file.index); } else ppp_unlock(ppp); - unit_put(&pn->units_idr, ppp->file.index); ppp->file.dead = 1; ppp->owner = NULL; wake_up_interruptible(&ppp->file.rwait); @@ -2843,8 +2854,7 @@ static void __exit ppp_cleanup(void) * by holding all_ppp_mutex */ -/* associate pointer with specified number */ -static int unit_set(struct idr *p, void *ptr, int n) +static int __unit_alloc(struct idr *p, void *ptr, int n) { int unit, err; @@ -2855,10 +2865,24 @@ again: } err = idr_get_new_above(p, ptr, n, &unit); - if (err == -EAGAIN) - 
goto again; + if (err < 0) { + if (err == -EAGAIN) + goto again; + return err; + } - if (unit != n) { + return unit; +} + +/* associate pointer with specified number */ +static int unit_set(struct idr *p, void *ptr, int n) +{ + int unit; + + unit = __unit_alloc(p, ptr, n); + if (unit < 0) + return unit; + else if (unit != n) { idr_remove(p, unit); return -EINVAL; } @@ -2869,19 +2893,7 @@ again: /* get new free unit number and associate pointer with it */ static int unit_get(struct idr *p, void *ptr) { - int unit, err; - -again: - if (!idr_pre_get(p, GFP_KERNEL)) { - printk(KERN_ERR "PPP: No free memory for idr\n"); - return -ENOMEM; - } - - err = idr_get_new_above(p, ptr, 0, &unit); - if (err == -EAGAIN) - goto again; - - return unit; + return __unit_alloc(p, ptr, 0); } /* put unit number back to a pool */ diff -urNp linux-2.6.32.48/drivers/net/pppoe.c linux-2.6.32.48-openvz/drivers/net/pppoe.c --- linux-2.6.32.48/drivers/net/pppoe.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/pppoe.c 2011-11-21 17:40:45.000000000 -0500 @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -452,6 +453,8 @@ static int pppoe_rcv(struct sk_buff *skb goto drop; pn = pppoe_pernet(dev_net(dev)); + if (!pn) /* no VE_FEATURE_PPP */ + goto drop; /* Note that get_item does a sock_hold(), so sk_pppox(po) * is known to be safe. 
@@ -494,6 +497,9 @@ static int pppoe_disc_rcv(struct sk_buff goto abort; pn = pppoe_pernet(dev_net(dev)); + if (!pn) /* no VE_FEATURE_PPP */ + goto abort; + po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex); if (po) { struct sock *sk = sk_pppox(po); @@ -547,6 +553,9 @@ static int pppoe_create(struct net *net, { struct sock *sk; + if (!(get_exec_env()->features & VE_FEATURE_PPP)) + return -EACCES; + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto); if (!sk) return -ENOMEM; @@ -1144,6 +1153,9 @@ static __net_init int pppoe_init_net(str struct proc_dir_entry *pde; int err; + if (!(get_exec_env()->features & VE_FEATURE_PPP)) + return 0; + pn = kzalloc(sizeof(*pn), GFP_KERNEL); if (!pn) return -ENOMEM; @@ -1173,8 +1185,11 @@ static __net_exit void pppoe_exit_net(st { struct pppoe_net *pn; - proc_net_remove(net, "pppoe"); pn = net_generic(net, pppoe_net_id); + if (!pn) /* no VE_FEATURE_PPP */ + return; + + proc_net_remove(net, "pppoe"); /* * if someone has cached our net then * further net_generic call will return NULL diff -urNp linux-2.6.32.48/drivers/net/pppol2tp.c linux-2.6.32.48-openvz/drivers/net/pppol2tp.c --- linux-2.6.32.48/drivers/net/pppol2tp.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/pppol2tp.c 2011-11-21 17:40:45.000000000 -0500 @@ -97,6 +97,7 @@ #include #include #include +#include #include #include @@ -1591,6 +1592,9 @@ static int pppol2tp_create(struct net *n int error = -ENOMEM; struct sock *sk; + if (!(get_exec_env()->features & VE_FEATURE_PPP)) + return -EACCES; + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto); if (!sk) goto out; @@ -2609,6 +2613,9 @@ static __net_init int pppol2tp_init_net( struct proc_dir_entry *pde; int err; + if (!(get_exec_env()->features & VE_FEATURE_PPP)) + return 0; + pn = kzalloc(sizeof(*pn), GFP_KERNEL); if (!pn) return -ENOMEM; @@ -2639,8 +2646,11 @@ static __net_exit void pppol2tp_exit_net { struct pppoe_net *pn; - proc_net_remove(net, "pppol2tp"); pn = 
net_generic(net, pppol2tp_net_id); + if (!pn) /* no VE_FEATURE_PPP */ + return; + + proc_net_remove(net, "pppol2tp"); /* * if someone has cached our net then * further net_generic call will return NULL diff -urNp linux-2.6.32.48/drivers/net/tun.c linux-2.6.32.48-openvz/drivers/net/tun.c --- linux-2.6.32.48/drivers/net/tun.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/tun.c 2011-11-21 17:40:45.000000000 -0500 @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -69,6 +70,9 @@ #include #include +#include +#include + /* Uncomment to enable debugging */ /* #define TUN_DEBUG 1 */ @@ -93,6 +97,7 @@ struct tun_file { atomic_t count; struct tun_struct *tun; struct net *net; + struct file *file; }; struct tun_sock; @@ -124,6 +129,15 @@ static inline struct tun_sock *tun_sk(st return container_of(sk, struct tun_sock, sk); } +static void __tun_attach(struct tun_struct *tun, struct tun_file *tfile) +{ + tfile->tun = tun; + tun->tfile = tfile; + dev_hold(tun->dev); + sock_hold(tun->socket.sk); + atomic_inc(&tfile->count); +} + static int tun_attach(struct tun_struct *tun, struct file *file) { struct tun_file *tfile = file->private_data; @@ -142,12 +156,7 @@ static int tun_attach(struct tun_struct goto out; err = 0; - tfile->tun = tun; - tun->tfile = tfile; - dev_hold(tun->dev); - sock_hold(tun->socket.sk); - atomic_inc(&tfile->count); - + __tun_attach(tun, tfile); out: netif_tx_unlock_bh(tun->dev); return err; @@ -418,12 +427,16 @@ tun_net_change_mtu(struct net_device *de return 0; } +static void tun_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context * ctx); + static const struct net_device_ops tun_netdev_ops = { .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_change_mtu = tun_net_change_mtu, + .ndo_cpt = tun_cpt, }; static const struct net_device_ops tap_netdev_ops = { @@ -435,6 +448,7 @@ static const struct net_device_ops 
tap_n .ndo_set_multicast_list = tun_net_mclist, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, + .ndo_cpt = tun_cpt, }; /* Initialize net device. */ @@ -513,12 +527,8 @@ static inline struct sk_buff *tun_alloc_ struct sk_buff *skb; int err; - /* Under a page? Don't bother with paged skb. */ - if (prepad + len < PAGE_SIZE || !linear) - linear = len; - - skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - &err); + linear = len; + skb = sock_alloc_send_skb(sk, prepad + linear, noblock, &err); if (!skb) return ERR_PTR(err); @@ -819,6 +829,7 @@ static void tun_setup(struct net_device dev->ethtool_ops = &tun_ethtool_ops; dev->destructor = tun_free_netdev; + dev->features |= NETIF_F_VIRTUAL; } /* Trivial set of netlink ops to allow deleting tun or tap @@ -864,6 +875,29 @@ static struct proto tun_proto = { .obj_size = sizeof(struct tun_sock), }; +static int tun_sk_alloc_init(struct net *net, struct tun_struct *tun, + struct sock **psk) +{ + struct sock *sk; + + sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto); + if (!sk) + return -ENOMEM; + + init_waitqueue_head(&tun->socket.wait); + sock_init_data(&tun->socket, sk); + sk->sk_write_space = tun_sock_write_space; + sk->sk_sndbuf = INT_MAX; + + container_of(sk, struct tun_sock, sk)->tun = tun; + + security_tun_dev_post_create(sk); + + *psk = sk; + return 0; + +} + static int tun_flags(struct tun_struct *tun) { int flags = 0; @@ -932,7 +966,7 @@ static int tun_set_iff(struct net *net, if (((tun->owner != -1 && cred->euid != tun->owner) || (tun->group != -1 && !in_egroup_p(tun->group))) && - !capable(CAP_NET_ADMIN)) + !capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; err = security_tun_dev_attach(tun->socket.sk); if (err < 0) @@ -946,7 +980,7 @@ static int tun_set_iff(struct net *net, char *name; unsigned long flags = 0; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; err = 
security_tun_dev_create(); if (err < 0) @@ -980,20 +1014,10 @@ static int tun_set_iff(struct net *net, tun->flags = flags; tun->txflt.count = 0; - err = -ENOMEM; - sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto); - if (!sk) + err = tun_sk_alloc_init(net, tun, &sk); + if (err) goto err_free_dev; - init_waitqueue_head(&tun->socket.wait); - sock_init_data(&tun->socket, sk); - sk->sk_write_space = tun_sock_write_space; - sk->sk_sndbuf = INT_MAX; - - container_of(sk, struct tun_sock, sk)->tun = tun; - - security_tun_dev_post_create(sk); - tun_net_init(dev); if (strchr(dev->name, '%')) { @@ -1006,10 +1030,10 @@ static int tun_set_iff(struct net *net, if (err < 0) goto err_free_sk; - if (!net_eq(dev_net(tun->dev), &init_net) || - device_create_file(&tun->dev->dev, &dev_attr_tun_flags) || - device_create_file(&tun->dev->dev, &dev_attr_owner) || - device_create_file(&tun->dev->dev, &dev_attr_group)) + if ((dev_net(tun->dev) == &init_net) && + (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) || + device_create_file(&tun->dev->dev, &dev_attr_owner) || + device_create_file(&tun->dev->dev, &dev_attr_group))) printk(KERN_ERR "Failed to create tun sysfs files\n"); sk->sk_destruct = tun_sock_destruct; @@ -1317,6 +1341,7 @@ static int tun_chr_open(struct inode *in tfile->tun = NULL; tfile->net = get_net(current->nsproxy->net_ns); file->private_data = tfile; + tfile->file = file; return 0; } @@ -1458,6 +1483,275 @@ static const struct ethtool_ops tun_etht .set_rx_csum = tun_set_rx_csum };
+/*
+ * Write the tap filter @flt to the dump as a CPT_OBJ_NET_TAP_FILTER image,
+ * bracketed by ops->push_object() and ops->pop_object().
+ */
+static void cpt_dump_tap_filter(struct tap_filter *flt,
+		struct cpt_ops *ops, struct cpt_context *ctx)
+{
+	struct cpt_tap_filter_image v;
+	loff_t saved_obj;
+
+	ops->push_object(&saved_obj, ctx);
+
+	/* keep stale stack bytes out of the dump */
+	memset(&v, 0, sizeof(v));
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_TAP_FILTER;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_VOID;
+
+	v.cpt_count = flt->count;
+
+	BUILD_BUG_ON(sizeof(flt->mask) != sizeof(v.cpt_mask));
+	memcpy(v.cpt_mask, flt->mask, sizeof(v.cpt_mask));
+
+	BUILD_BUG_ON(sizeof(flt->addr) != sizeof(v.cpt_addr));
+	memcpy(v.cpt_addr, flt->addr, sizeof(v.cpt_addr));
+
+	ops->write(&v, sizeof(v), ctx);
+
+	ops->pop_object(&saved_obj, ctx);
+}
+
+/*
+ * Dump one tun/tap device: a CPT_OBJ_NET_TUNTAP image followed by the image
+ * of its txflt tap filter.
+ */
+static void tun_cpt(struct net_device *dev,
+		struct cpt_ops *ops, struct cpt_context * ctx)
+{
+	struct cpt_tuntap_image v;
+	struct tun_struct *tun;
+
+	tun = netdev_priv(dev);
+
+	/*
+	 * Start from an all-zero image so that nothing which is not assigned
+	 * below reaches the dump as stale stack contents. This leaves
+	 * cpt_bindfile at 0, which tun_rst() reads as "no file", and zeroes
+	 * cpt_if_flags, cpt_dev_addr, cpt_chr_filter and cpt_net_filter.
+	 */
+	memset(&v, 0, sizeof(v));
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_TUNTAP;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_VOID;
+
+	v.cpt_owner = tun->owner;
+	v.cpt_flags = tun->flags;
+
+	/* tun->tfile is only set by __tun_attach(); it may be missing */
+	if (tun->tfile && tun->tfile->file)
+		v.cpt_bindfile = ops->lookup_object(CPT_OBJ_FILE,
+				tun->tfile->file, ctx);
+
+	ops->write(&v, sizeof(v), ctx);
+
+	cpt_dump_tap_filter(&tun->txflt, ops, ctx);
+}
+
+/*
+ * Restore the tap filter @flt of the tun/tap image @ti located at @start.
+ *
+ * If a CPT_OBJ_NET_TAP_FILTER image follows the tun/tap header it is copied
+ * into @flt. Otherwise the filter is rebuilt from the cpt_if_flags,
+ * cpt_dev_addr and cpt_chr_filter fields of @ti. Returns 0 on success or a
+ * negative errno.
+ */
+static int rst_restore_tap_filter(loff_t start, struct cpt_tuntap_image *ti,
+		struct tap_filter *flt, struct rst_ops *ops,
+		struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_tap_filter_image fi;
+	loff_t pos;
+
+	/* disable filtering */
+	flt->count = 0;
+
+	pos = start + ti->cpt_hdrlen;
+
+	/* no tap filter image? */
+	if (pos >= start + ti->cpt_next)
+		goto convert;
+
+	err = ops->get_object(CPT_OBJ_NET_TAP_FILTER, pos,
+			&fi, sizeof(fi), ctx);
+	if (err)
+		return err;
+
+	/* a count beyond the addr[] table cannot describe valid filters */
+	if (fi.cpt_count > ARRAY_SIZE(flt->addr))
+		return -EINVAL;
+
+	BUILD_BUG_ON(sizeof(flt->mask) != sizeof(fi.cpt_mask));
+	memcpy(flt->mask, fi.cpt_mask, sizeof(fi.cpt_mask));
+
+	BUILD_BUG_ON(sizeof(flt->addr) != sizeof(fi.cpt_addr));
+	memcpy(flt->addr, fi.cpt_addr, sizeof(fi.cpt_addr));
+
+	flt->count = fi.cpt_count;
+
+	return 0;
+
+convert:
+	/** From OLD filtering code:
+	 * Decide whether to accept this packet. This code is designed to
+	 * behave identically to an Ethernet interface. Accept the packet if
+	 * - we are promiscuous.
+	 * - the packet is addressed to us.
+	 * - the packet is broadcast.
+	 * - the packet is multicast and
+	 * - we are multicast promiscuous.
+	 * - we belong to the multicast group.
+	 */
+
+	/* accept all, this is default if filter is untouched */
+	if (ti->cpt_if_flags & IFF_PROMISC)
+		return 0;
+
+	/* accept packets addressed to character device's hardware address */
+	BUILD_BUG_ON(sizeof(flt->addr[0]) != sizeof(ti->cpt_dev_addr));
+	memcpy(flt->addr[0], ti->cpt_dev_addr, sizeof(ti->cpt_dev_addr));
+
+	/* accept broadcast */
+	memset(flt->addr[1], ~0, sizeof(flt->addr[1]));
+
+	/* accept hashed multicast: hash function the same as in old code */
+	BUILD_BUG_ON(sizeof(flt->mask) != sizeof(ti->cpt_chr_filter));
+	memcpy(flt->mask, ti->cpt_chr_filter, sizeof(ti->cpt_chr_filter));
+
+	/* accept all multicast */
+	if (ti->cpt_if_flags & IFF_ALLMULTI)
+		memset(flt->mask, ~0, sizeof(flt->mask));
+
+	/* two exact filters: hw addr and broadcast */
+	flt->count = 2;
+
+	return 0;
+}
+
+/*
+ * Restore one tun/tap device from the netdev image @di located at @start.
+ *
+ * Reads the CPT_OBJ_NET_TUNTAP image behind the netdev header, restores the
+ * file the device was bound to (if one was recorded), re-creates the net
+ * device with its socket and tap filter, registers it and attaches it to
+ * that file. Returns 0 on success or a negative errno; on failure
+ * everything acquired here is released again.
+ */
+static int tun_rst(loff_t start, struct cpt_netdev_image *di,
+		struct rst_ops *ops, struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_tuntap_image ti;
+	struct net_device *dev;
+	struct file *bind_file = NULL;
+	struct tun_struct *tun;
+	struct tun_file *tfile;
+	struct sock *sk;
+	loff_t pos;
+
+	pos = start + di->cpt_hdrlen;
+	err = ops->get_object(CPT_OBJ_NET_TUNTAP, pos,
+			&ti, sizeof(ti), ctx);
+	if (err)
+		return err;
+
+	/* cpt_bindfile == 0: no file was recorded, bind_file stays NULL */
+	if (ti.cpt_bindfile) {
+		bind_file = ops->rst_file(ti.cpt_bindfile, -1, ctx);
+		if (IS_ERR(bind_file))
+			return PTR_ERR(bind_file);
+	}
+
+	/* err is 0 here, so it must be set before the first "goto out" */
+	err = -ENOMEM;
+	tfile = kmalloc(sizeof(*tfile), GFP_KERNEL);
+	if (!tfile)
+		goto out;
+
+	atomic_set(&tfile->count, 0);
+	tfile->tun = NULL;
+	tfile->net = get_net(current->nsproxy->net_ns);
+	tfile->file = bind_file;
+
+	dev = alloc_netdev(sizeof(struct tun_struct), di->cpt_name, tun_setup);
+	if (!dev)
+		goto out_tf;
+
+	tun = netdev_priv(dev);
+
+	tun->dev = dev;
+	tun->owner = ti.cpt_owner;
+	tun->flags = ti.cpt_flags;
+	tun_net_init(dev);
+
+	err = tun_sk_alloc_init(current->nsproxy->net_ns, tun, &sk);
+	if (err)
+		goto out_netdev;
+
+	err = rst_restore_tap_filter(pos, &ti, &tun->txflt, ops, ctx);
+	if (err < 0)
+		goto out_sk;
+
+	/*
+	 * Everything that can fail is done before register_netdevice(), so
+	 * no error path ever has to undo a registered device.
+	 */
+	pos += ti.cpt_next;
+	if (pos < start + di->cpt_next) {
+		struct cpt_hwaddr_image hw;
+
+		/* Restore hardware address */
+		err = ops->get_object(CPT_OBJ_NET_HWADDR, pos,
+				&hw, sizeof(hw), ctx);
+		if (err)
+			goto out_sk;
+
+		memcpy(dev->dev_addr, hw.cpt_dev_addr,
+				sizeof(hw.cpt_dev_addr));
+	}
+
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto out_sk;
+
+	sk->sk_destruct = tun_sock_destruct;
+	if (bind_file) {
+		bind_file->private_data = tfile;
+		__tun_attach(tun, tfile);
+		fput(bind_file);
+	} else {
+		/* no file will take over this tun_file, so drop it here */
+		put_net(tfile->net);
+		kfree(tfile);
+	}
+	return 0;
+
+out_sk:
+	sock_put(sk);
+out_netdev:
+	free_netdev(dev);
+out_tf:
+	put_net(tfile->net);
+	kfree(tfile);
+out:
+	if (bind_file)
+		fput(bind_file);
+	return err;
+}
+
+/* Restore handler for CPT_OBJ_NET_TUNTAP images; registered in tun_init() */
+static struct netdev_rst tun_netdev_rst = {
+	.cpt_object = CPT_OBJ_NET_TUNTAP,
+	.ndo_rst = tun_rst,
+};
+
 static int __init tun_init(void) { @@ -1477,6 +1771,8 @@ static int __init tun_init(void) printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR); goto err_misc; } + + register_netdev_rst(&tun_netdev_rst); return 0; err_misc: rtnl_link_unregister(&tun_link_ops); @@ -1486,6 +1782,7 @@ err_linkops: static void tun_cleanup(void) { + unregister_netdev_rst(&tun_netdev_rst); misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); } diff -urNp linux-2.6.32.48/drivers/net/venet_core.c linux-2.6.32.48-openvz/drivers/net/venet_core.c --- linux-2.6.32.48/drivers/net/venet_core.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/venet_core.c 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,864 @@ +/* + * venet_core.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file.
+ * + */ + +/* + * Common part for Virtuozzo virtual network devices + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure. */ +#include /* For ARPHRD_ETHER */ +#include +#include +#include +#include +#include + +struct list_head ip_entry_hash_table[VEIP_HASH_SZ]; +rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED; +LIST_HEAD(veip_lh); + +#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1)) + +void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip) +{ + list_add(&entry->ip_hash, + ip_entry_hash_table + + ip_entry_hash_function(entry->addr.key[3])); + list_add(&entry->ve_list, &veip->ip_lh); +} + +void veip_put(struct veip_struct *veip) +{ + if (!list_empty(&veip->ip_lh)) + return; + if (!list_empty(&veip->src_lh)) + return; + if (!list_empty(&veip->dst_lh)) + return; + + list_del(&veip->list); + kfree(veip); +} + +struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr) +{ + struct ip_entry_struct *entry; + + list_for_each_entry (entry, ip_entry_hash_table + + ip_entry_hash_function(addr->key[3]), ip_hash) + if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0) + return entry; + return NULL; +} + +struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve, + struct ve_addr_struct *addr) +{ + struct ext_entry_struct *entry; + + if (ve->veip == NULL) + return NULL; + + list_for_each_entry (entry, &ve->veip->ext_lh, list) + if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0) + return entry; + return NULL; +} + +int venet_ext_add(struct ve_struct *ve, struct ve_addr_struct *addr) +{ + struct ext_entry_struct *entry, *found; + int err; + + if (ve->veip == NULL) + return -ENONET; + + entry = kzalloc(sizeof(struct ext_entry_struct), GFP_KERNEL); + if (entry == 
NULL) + return -ENOMEM; + + write_lock_irq(&veip_hash_lock); + err = -EADDRINUSE; + found = venet_ext_lookup(ve, addr); + if (found != NULL) + goto out_unlock; + + entry->addr = *addr; + list_add(&entry->list, &ve->veip->ext_lh); + err = 0; + entry = NULL; +out_unlock: + write_unlock_irq(&veip_hash_lock); + if (entry != NULL) + kfree(entry); + return err; +} + +int venet_ext_del(struct ve_struct *ve, struct ve_addr_struct *addr) +{ + struct ext_entry_struct *found; + int err; + + if (ve->veip == NULL) + return -ENONET; + + err = -EADDRNOTAVAIL; + write_lock_irq(&veip_hash_lock); + found = venet_ext_lookup(ve, addr); + if (found == NULL) + goto out; + + list_del(&found->list); + kfree(found); + err = 0; +out: + write_unlock_irq(&veip_hash_lock); + return err; +} + +void venet_ext_clean(struct ve_struct *ve) +{ + struct ext_entry_struct *entry, *tmp; + + if (ve->veip == NULL) + return; + + write_lock_irq(&veip_hash_lock); + list_for_each_entry_safe (entry, tmp, &ve->veip->ext_lh, list) { + list_del(&entry->list); + kfree(entry); + } + write_unlock_irq(&veip_hash_lock); +} + +struct veip_struct *veip_find(envid_t veid) +{ + struct veip_struct *ptr; + + list_for_each_entry(ptr, &veip_lh, list) { + if (ptr->veid != veid) + continue; + return ptr; + } + return NULL; +} + +struct veip_struct *veip_findcreate(envid_t veid) +{ + struct veip_struct *ptr; + + ptr = veip_find(veid); + if (ptr != NULL) + return ptr; + + ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC); + if (ptr == NULL) + return NULL; + memset(ptr, 0, sizeof(struct veip_struct)); + INIT_LIST_HEAD(&ptr->ip_lh); + INIT_LIST_HEAD(&ptr->src_lh); + INIT_LIST_HEAD(&ptr->dst_lh); + INIT_LIST_HEAD(&ptr->ext_lh); + ptr->veid = veid; + list_add(&ptr->list, &veip_lh); + return ptr; +} + +static int convert_sockaddr(struct sockaddr *addr, int addrlen, + struct ve_addr_struct *veaddr) +{ + int err; + + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *sin; + + err = -EINVAL; + if (addrlen != 
sizeof(struct sockaddr_in)) + break; + + err = 0; + sin = (struct sockaddr_in *)addr; + veaddr->family = AF_INET; + veaddr->key[0] = 0; + veaddr->key[1] = 0; + veaddr->key[2] = 0; + veaddr->key[3] = sin->sin_addr.s_addr; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin; + + err = -EINVAL; + if (addrlen != sizeof(struct sockaddr_in6)) + break; + + err = 0; + sin = (struct sockaddr_in6 *)addr; + veaddr->family = AF_INET6; + memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key)); + break; + } + default: + err = -EAFNOSUPPORT; + } + return err; +} + +int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, + struct ve_addr_struct *veaddr) +{ + int err; + char addr[MAX_SOCK_ADDR]; + + err = move_addr_to_kernel(uaddr, addrlen, (struct sockaddr *)&addr); + if (err < 0) + goto out; + + err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr); +out: + return err; +} + +void veaddr_print(char *str, int len, struct ve_addr_struct *a) +{ + if (a->family == AF_INET) + snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3])); + else + snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x", + ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF, + ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF, + ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF, + ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF + ); +} + +/* + * Device functions + */ + +static int venet_open(struct net_device *dev) +{ + if (!ve_is_super(get_exec_env()) && !try_module_get(THIS_MODULE)) + return -EBUSY; + return 0; +} + +static int venet_close(struct net_device *master) +{ + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); + return 0; +} + +static void venet_destructor(struct net_device *dev) +{ + struct venet_stats *stats = (struct venet_stats *)dev->ml_priv; + if (stats == NULL) + return; + free_percpu(stats->real_stats); + kfree(stats); + dev->ml_priv = NULL; +} + +/* + * The higher levels take care of making this non-reentrant (it's + * called with bh's disabled). 
+ */ +static int venet_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats; + struct net_device *rcv = NULL; + int length; + + stats = venet_stats(dev, smp_processor_id()); + if (unlikely(get_exec_env()->disable_net)) + goto outf; + + if (skb->protocol == __constant_htons(ETH_P_IP)) { + struct iphdr *iph; + iph = ip_hdr(skb); + if (ipv4_is_multicast(iph->daddr)) + goto outf; + } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h; + ip6h = ipv6_hdr(skb); + if (ipv6_addr_is_multicast(&ip6h->daddr)) + goto outf; + skb_orphan(skb); + } else { + goto outf; + } + + if (venet_change_skb_owner(skb) < 0) + goto outf; + + if (unlikely(skb->owner_env->disable_net)) + goto outf; + + rcv = skb->owner_env->_venet_dev; + if (!rcv) + /* VE going down */ + goto outf; + + dev_hold(rcv); + + if (!(rcv->flags & IFF_UP)) { + /* Target VE does not want to receive packets */ + dev_put(rcv); + goto outf; + } + + skb->pkt_type = PACKET_HOST; + skb->dev = rcv; + + skb_reset_mac_header(skb); + memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len); + + nf_reset(skb); + length = skb->len; + + netif_rx(skb); + + stats->tx_bytes += length; + stats->tx_packets++; + if (rcv) { + struct net_device_stats *rcv_stats; + + rcv_stats = venet_stats(rcv, smp_processor_id()); + rcv_stats->rx_bytes += length; + rcv_stats->rx_packets++; + dev_put(rcv); + } + + return 0; + +outf: + kfree_skb(skb); + ++stats->tx_dropped; + return 0; +} + +static struct net_device_stats *get_stats(struct net_device *dev) +{ + int i; + struct venet_stats *stats; + + stats = (struct venet_stats *)dev->ml_priv; + memset(&stats->stats, 0, sizeof(struct net_device_stats)); + for (i=0; i < NR_CPUS; i++) { + struct net_device_stats *dev_stats; + + if (!cpu_possible(i)) + continue; + dev_stats = venet_stats(dev, i); + stats->stats.rx_bytes += dev_stats->rx_bytes; + stats->stats.tx_bytes += dev_stats->tx_bytes; + stats->stats.rx_packets += dev_stats->rx_packets; 
+ stats->stats.tx_packets += dev_stats->tx_packets; + } + + return &stats->stats; +} + +/* Initialize the rest of the LOOPBACK device. */ +int venet_init_dev(struct net_device *dev) +{ + struct venet_stats *stats; + + stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL); + if (stats == NULL) + goto fail; + stats->real_stats = alloc_percpu(struct net_device_stats); + if (stats->real_stats == NULL) + goto fail_free; + dev->ml_priv = stats; + + /* + * Fill in the generic fields of the device structure. + */ + dev->type = ARPHRD_VOID; + dev->hard_header_len = ETH_HLEN; + dev->mtu = 1500; /* eth_mtu */ + dev->tx_queue_len = 0; + + memset(dev->broadcast, 0xFF, ETH_ALEN); + + /* New-style flags. */ + dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT; + return 0; + +fail_free: + kfree(stats); +fail: + return -ENOMEM; +} + +static const struct net_device_ops venet_netdev_ops; + +static int +venet_set_op(struct net_device *dev, u32 data, + int (*fop)(struct net_device *, u32)) +{ + + struct ve_struct *ve; + int ret = 0; + + read_lock(&ve_list_lock); + for_each_ve(ve) { + struct ve_struct *ve_old; + + ve_old = set_exec_env(ve); + read_lock(&dev_base_lock); + for_each_netdev(ve->ve_netns, dev) { + if (dev->netdev_ops == &venet_netdev_ops) + ret = fop(dev, data); + } + read_unlock(&dev_base_lock); + set_exec_env(ve_old); + + if (ret < 0) + break; + } + read_unlock(&ve_list_lock); + return ret; +} + +static unsigned long common_features; + +static int venet_op_set_sg(struct net_device *dev, u32 data) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + if (data) + common_features |= NETIF_F_SG; + else + common_features &= ~NETIF_F_SG; + + return venet_set_op(dev, data, ethtool_op_set_sg); +} + +static int venet_op_set_tx_csum(struct net_device *dev, u32 data) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + if (data) + common_features |= NETIF_F_IP_CSUM; + else + common_features &= ~NETIF_F_IP_CSUM; + + return venet_set_op(dev, data, 
ethtool_op_set_tx_csum); +} + +static int +venet_op_set_tso(struct net_device *dev, u32 data) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + if (data) + common_features |= NETIF_F_TSO; + else + common_features &= ~NETIF_F_TSO; + + return venet_set_op(dev, data, ethtool_op_set_tso); +} + +#define venet_op_set_rx_csum venet_op_set_tx_csum + +static struct ethtool_ops venet_ethtool_ops = { + .get_sg = ethtool_op_get_sg, + .set_sg = venet_op_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = venet_op_set_tx_csum, + .get_rx_csum = ethtool_op_get_tx_csum, + .set_rx_csum = venet_op_set_rx_csum, + .get_tso = ethtool_op_get_tso, + .set_tso = venet_op_set_tso, +}; + +static void venet_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context *ctx) +{ +} + +static const struct net_device_ops venet_netdev_ops = { + .ndo_start_xmit = venet_xmit, + .ndo_get_stats = get_stats, + .ndo_open = venet_open, + .ndo_stop = venet_close, + .ndo_init = venet_init_dev, + .ndo_cpt = venet_cpt, +}; + +static void venet_setup(struct net_device *dev) +{ + /* + * No other features, as they are: + * - checksumming is required, and nobody else will done our job + */ + dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX | + NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED; + + dev->netdev_ops = &venet_netdev_ops; + dev->destructor = venet_destructor; + + dev->features |= common_features; + + SET_ETHTOOL_OPS(dev, &venet_ethtool_ops); +} + +#ifdef CONFIG_PROC_FS +static void veaddr_seq_print(struct seq_file *m, struct ve_struct *ve) +{ + struct ip_entry_struct *entry; + + read_lock(&veip_hash_lock); + if (ve->veip == NULL) + goto unlock; + list_for_each_entry (entry, &ve->veip->ip_lh, ve_list) { + char addr[40]; + + if (entry->active_env == NULL) + continue; + + veaddr_print(addr, sizeof(addr), &entry->addr); + if (entry->addr.family == AF_INET) + seq_printf(m, " %15s", addr); + else + seq_printf(m, " %39s", addr); + } +unlock: + 
read_unlock(&veip_hash_lock); +} + +static void *veip_seq_start(struct seq_file *m, loff_t *pos) +{ + loff_t l; + struct list_head *p; + int i; + + l = *pos; + write_lock_irq(&veip_hash_lock); + if (l == 0) + return ip_entry_hash_table; + for (i = 0; i < VEIP_HASH_SZ; i++) { + list_for_each(p, ip_entry_hash_table + i) { + if (--l == 0) + return p; + } + } + return NULL; +} + +static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct list_head *p; + + p = (struct list_head *)v; + while (1) { + p = p->next; + if (p < ip_entry_hash_table || + p >= ip_entry_hash_table + VEIP_HASH_SZ) { + (*pos)++; + return p; + } + if (++p >= ip_entry_hash_table + VEIP_HASH_SZ) + return NULL; + } + return NULL; +} + +static void veip_seq_stop(struct seq_file *m, void *v) +{ + write_unlock_irq(&veip_hash_lock); +} + +static struct seq_operations veip_seq_op = { + .start = veip_seq_start, + .next = veip_seq_next, + .stop = veip_seq_stop, + .show = veip_seq_show, +}; + +static int veip_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &veip_seq_op); +} + +static struct file_operations proc_veip_operations = { + .open = veip_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + +static int real_ve_ip_map(envid_t veid, int op, struct sockaddr __user *uaddr, + int addrlen) +{ + int err; + struct ve_struct *ve; + struct ve_addr_struct addr; + + err = -EPERM; + if (!capable_setveid()) + goto out; + + err = sockaddr_to_veaddr(uaddr, addrlen, &addr); + if (err < 0) + goto out; + + switch (op) + { + case VE_IP_ADD: + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veip_entry_add(ve, &addr); + up_read(&ve->op_sem); + put_ve(ve); + break; + + case VE_IP_DEL: + err = veip_entry_del(veid, &addr); + break; + case VE_IP_EXT_ADD: + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + err = venet_ext_add(ve, 
&addr); + up_read(&ve->op_sem); + put_ve(ve); + break; + case VE_IP_EXT_DEL: + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + err = venet_ext_del(ve, &addr); + up_read(&ve->op_sem); + put_ve(ve); + break; + default: + err = -EINVAL; + } + +out: + return err; +} + +int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VENETCTL_VE_IP_MAP: { + struct vzctl_ve_ip_map s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen); + break; + } + } + return err; +} + +#ifdef CONFIG_COMPAT +int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch(cmd) { + case VENETCTL_COMPAT_VE_IP_MAP: { + struct compat_vzctl_ve_ip_map cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr), + cs.addrlen); + break; + } + default: + err = venet_ioctl(file, cmd, arg); + break; + } + return err; +} +#endif + +static struct vzioctlinfo venetcalls = { + .type = VENETCTLTYPE, + .ioctl = venet_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_venet_ioctl, +#endif + .owner = THIS_MODULE, +}; + +int venet_dev_start(struct ve_struct *ve) +{ + struct net_device *dev_venet; + int err; + + dev_venet = alloc_netdev(0, "venet%d", venet_setup); + if (!dev_venet) + return -ENOMEM; + dev_net_set(dev_venet, ve->ve_netns); + err = dev_alloc_name(dev_venet, dev_venet->name); + if (err<0) + goto err; + if ((err = register_netdev(dev_venet)) != 0) + goto err; + ve->_venet_dev = dev_venet; + return 0; +err: + free_netdev(dev_venet); + printk(KERN_ERR "VENET initialization error err=%d\n", err); + return err; +} + +static int venet_start(void *data) +{ + struct ve_struct *env; + int err; + + env = (struct ve_struct *)data; + if (env->veip) + return -EEXIST; + + err = 
veip_start(env); + if (err != 0) + return err; + + err = venet_dev_start(env); + if (err) + goto err_free; + return 0; + +err_free: + veip_stop(env); + return err; +} + +static void venet_stop(void *data) +{ + struct ve_struct *env; + struct net_device *dev; + + env = (struct ve_struct *)data; + venet_ext_clean(env); + veip_stop(env); + + dev = env->_venet_dev; + if (dev == NULL) + return; + + unregister_netdev(dev); + env->_venet_dev = NULL; + free_netdev(dev); +} + +static struct ve_hook venet_ve_hook = { + .init = venet_start, + .fini = venet_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET, +}; + +__init int venet_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *de; +#endif + int i, err; + + if (get_ve0()->_venet_dev != NULL) + return -EEXIST; + + for (i = 0; i < VEIP_HASH_SZ; i++) + INIT_LIST_HEAD(ip_entry_hash_table + i); + + err = venet_start(get_ve0()); + if (err) + return err; + +#ifdef CONFIG_PROC_FS + de = proc_create("veip", S_IFREG | S_IRUSR, proc_vz_dir, + &proc_veip_operations); + if (de == NULL) + printk(KERN_WARNING "venet: can't make veip proc entry\n"); +#endif + + ve_hook_register(VE_SS_CHAIN, &venet_ve_hook); + vzioctl_register(&venetcalls); + vzmon_register_veaddr_print_cb(veaddr_seq_print); + return 0; +} + +__exit void venet_exit(void) +{ + vzmon_unregister_veaddr_print_cb(veaddr_seq_print); + vzioctl_unregister(&venetcalls); + ve_hook_unregister(&venet_ve_hook); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("veip", proc_vz_dir); +#endif + venet_stop(get_ve0()); + veip_cleanup(); +} + +module_init(venet_init); +module_exit(venet_exit); diff -urNp linux-2.6.32.48/drivers/net/veth.c linux-2.6.32.48-openvz/drivers/net/veth.c --- linux-2.6.32.48/drivers/net/veth.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/veth.c 2011-11-21 17:40:45.000000000 -0500 @@ -180,6 +180,7 @@ static netdev_tx_t veth_xmit(struct sk_b skb->mark = 0; secpath_reset(skb); nf_reset(skb); + skb_init_brmark(skb); length = 
skb->len; diff -urNp linux-2.6.32.48/drivers/net/vzethdev.c linux-2.6.32.48-openvz/drivers/net/vzethdev.c --- linux-2.6.32.48/drivers/net/vzethdev.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/net/vzethdev.c 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,749 @@ +/* + * veth.c + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +/* + * Virtual ethernet device used to change VE ownership on packets + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure. */ +#include /* For ARPHRD_ETHER */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +static LIST_HEAD(veth_hwaddr_list); +static DEFINE_RWLOCK(ve_hwaddr_lock); +static DECLARE_MUTEX(hwaddr_sem); + +struct net_device * veth_dev_start(char *dev_addr, char *name); + +struct veth_struct *hwaddr_entry_lookup(char *name) +{ + struct veth_struct *entry; + + list_for_each_entry(entry, &veth_hwaddr_list, hwaddr_list) { + BUG_ON(entry->pair == NULL); + if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0) + return entry; + } + return NULL; +} + +int veth_entry_add(struct ve_struct *ve, char *dev_addr, char *name, + char *dev_addr_ve, char *name_ve) +{ + struct net_device *dev_ve; + struct net_device *dev_ve0; + struct ve_struct *old_env; + char dev_name[IFNAMSIZ]; + int err; + + down(&hwaddr_sem); + + if (name[0] == '\0') + snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid); + else { + memcpy(dev_name, name, IFNAMSIZ - 1); + dev_name[IFNAMSIZ - 1] = '\0'; + } + dev_ve0 = veth_dev_start(dev_addr, dev_name); + if (IS_ERR(dev_ve0)) { + err = PTR_ERR(dev_ve0); + goto err; + } + + old_env = 
/*
 * Tear down a veth pair.
 *
 * @env:   environment under which the device that owns @entry is
 *         unregistered (the callers in this file pass that device's
 *         owner environment)
 * @entry: list member of the pair, as found on veth_hwaddr_list
 *
 * The callers in this file invoke this with hwaddr_sem held.  The
 * function takes rtnl_lock itself and restores the caller's exec
 * environment before returning.
 */
void veth_pair_del(struct ve_struct *env, struct veth_struct *entry)
{
	struct net_device *dev;
	struct ve_struct *old_env;

	/* Make the pair invisible to lookups and to the proc listing first */
	write_lock(&ve_hwaddr_lock);
	list_del(&entry->hwaddr_list);
	write_unlock(&ve_hwaddr_lock);

	dev = entry->pair;
	BUG_ON(entry->pair == NULL);

	/*
	 * Break the link in both directions; veth_xmit() drops packets
	 * when it finds no peer.
	 */
	veth_from_netdev(dev)->pair = NULL;
	entry->pair = NULL;
	rtnl_lock();
	/* Close the peer while running in the environment that owns it */
	old_env = set_exec_env(dev->owner_env);
	dev_close(dev);

	/*
	 * Now device from VE0 does not send or receive anything,
	 * i.e. dev->hard_start_xmit won't be called.
	 */
	/* Unregister each half under its own environment */
	set_exec_env(env);
	unregister_netdevice(veth_to_netdev(entry));
	set_exec_env(dev->owner_env);
	unregister_netdevice(dev);
	set_exec_env(old_env);
	rtnl_unlock();
}
+ return stats; +} + +/* + * The higher levels take care of making this non-reentrant (it's + * called with bh's disabled). + */ +static int veth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats; + struct net_device *rcv = NULL; + struct veth_struct *entry; + int length; + + stats = veth_stats(dev, smp_processor_id()); + if (unlikely(get_exec_env()->disable_net)) + goto outf; + + entry = veth_from_netdev(dev); + rcv = entry->pair; + if (!rcv) + /* VE going down */ + goto outf; + + if (!(rcv->flags & IFF_UP)) { + /* Target VE does not want to receive packets */ + goto outf; + } + + if (unlikely(rcv->owner_env->disable_net)) + goto outf; + /* Filtering */ + if (ve_is_super(dev->owner_env) && + !veth_from_netdev(rcv)->allow_mac_change) { + /* from VE0 to VEX */ + if (ve_is_super(rcv->owner_env)) + goto out; + if (is_multicast_ether_addr( + ((struct ethhdr *)skb->data)->h_dest)) + goto out; + if (!rcv->br_port && + compare_ether_addr(((struct ethhdr *)skb->data)->h_dest, rcv->dev_addr)) + goto outf; + } else if (!ve_is_super(dev->owner_env) && + !entry->allow_mac_change) { + /* from VEX to VE0 */ + if (!skb->dev->br_port && + compare_ether_addr(((struct ethhdr *)skb->data)->h_source, dev->dev_addr)) + goto outf; + } + +out: + skb->owner_env = rcv->owner_env; + + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, rcv); + + if (skb->protocol != __constant_htons(ETH_P_IP)) + skb_orphan(skb); + + nf_reset(skb); + length = skb->len; + skb_init_brmark(skb); + + netif_rx(skb); + + stats->tx_bytes += length; + stats->tx_packets++; + if (rcv) { + struct net_device_stats *rcv_stats; + rcv_stats = veth_stats(rcv, smp_processor_id()); + rcv_stats->rx_bytes += length; + rcv_stats->rx_packets++; + } + + return 0; + +outf: + kfree_skb(skb); + stats->tx_dropped++; + return 0; +} + +static int veth_set_mac(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (!ve_is_super(dev->owner_env) && + 
!veth_from_netdev(dev)->allow_mac_change) + return -EPERM; + if (netif_running(dev)) + return -EBUSY; + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + + return 0; +} + +int veth_init_dev(struct net_device *dev) +{ + veth_from_netdev(dev)->real_stats = + alloc_percpu(struct net_device_stats); + if (veth_from_netdev(dev)->real_stats == NULL) + return -ENOMEM; + + return 0; +} + +static int +veth_set_op(struct net_device *dev, u32 data, + int (*fop)(struct net_device *, u32)) +{ + struct net_device *pair; + int ret = 0; + + ret = fop(dev, data); + if (ret < 0) + goto out; + + pair = veth_from_netdev(dev)->pair; + if (pair) + ret = fop(pair, data); +out: + return ret; +} + +static int veth_op_set_sg(struct net_device *dev, u32 data) +{ + return veth_set_op(dev, data, ethtool_op_set_sg); +} + +static int veth_op_set_tx_csum(struct net_device *dev, u32 data) +{ + return veth_set_op(dev, data, ethtool_op_set_tx_csum); +} + +static int +veth_op_set_tso(struct net_device *dev, u32 data) +{ + return veth_set_op(dev, data, ethtool_op_set_tso); +} + +#define veth_op_set_rx_csum veth_op_set_tx_csum + +static struct ethtool_ops veth_ethtool_ops = { + .get_sg = ethtool_op_get_sg, + .set_sg = veth_op_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = veth_op_set_tx_csum, + .get_rx_csum = ethtool_op_get_tx_csum, + .set_rx_csum = veth_op_set_rx_csum, + .get_tso = ethtool_op_get_tso, + .set_tso = veth_op_set_tso, +}; + +static void veth_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context *ctx) +{ + struct cpt_veth_image v; + struct veth_struct *veth; + + veth = veth_from_netdev(dev); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_VETH; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_allow_mac_change = veth->allow_mac_change; + + ops->write(&v, sizeof(v), ctx); +} + +static int veth_rst(loff_t pos, struct cpt_netdev_image *di, + struct 
rst_ops *ops, + struct cpt_context *ctx) + +{ + int err; + struct cpt_veth_image vi; + struct veth_struct *veth; + struct net_device *dev; + + pos = pos + di->cpt_hdrlen; + err = ops->get_object(CPT_OBJ_NET_VETH, pos, + &vi, sizeof(vi), ctx); + if (err) + return err; + + dev = __dev_get_by_name(get_exec_env()->ve_ns->net_ns, di->cpt_name); + if (dev == NULL) + return -ENODEV; + + veth = veth_from_netdev(dev); + veth->allow_mac_change = vi.cpt_allow_mac_change; + + return 0; +} + +static struct netdev_rst veth_netdev_rst = { + .cpt_object = CPT_OBJ_NET_VETH, + .ndo_rst = veth_rst, +}; + +static const struct net_device_ops veth_ops = { + .ndo_init = veth_init_dev, + .ndo_start_xmit = veth_xmit, + .ndo_get_stats = get_stats, + .ndo_open = veth_open, + .ndo_stop = veth_close, + .ndo_set_mac_address = veth_set_mac, + .ndo_cpt = veth_cpt, +}; + +static void veth_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->netdev_ops = &veth_ops; + dev->destructor = veth_destructor; + dev->tx_queue_len = 0; + + /* + * No other features, as they are: + * - checksumming is required, and nobody else will done our job + */ + dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX | + NETIF_F_HIGHDMA; + + SET_ETHTOOL_OPS(dev, &veth_ethtool_ops); +} + +#ifdef CONFIG_PROC_FS +#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x" +#define ADDR_ARG(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5] +static int vehwaddr_seq_show(struct seq_file *m, void *v) +{ + struct list_head *p; + struct veth_struct *entry; + + p = (struct list_head *)v; + if (p == &veth_hwaddr_list) { + seq_puts(m, "Version: 1.0\n"); + return 0; + } + entry = list_entry(p, struct veth_struct, hwaddr_list); + seq_printf(m, ADDR_FMT " %16s ", + ADDR_ARG(entry->pair->dev_addr), entry->pair->name); + seq_printf(m, ADDR_FMT " %16s %10u %5s\n", + ADDR_ARG(veth_to_netdev(entry)->dev_addr), + veth_to_netdev(entry)->name, + VEID(veth_to_netdev(entry)->owner_env), + entry->allow_mac_change ? 
"allow" : "deny"); + return 0; +} + +static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos) +{ + read_lock(&ve_hwaddr_lock); + return seq_list_start_head(&veth_hwaddr_list, *pos); +} + +static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &veth_hwaddr_list, pos); +} + +static void vehwaddr_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_hwaddr_lock); +} + +static struct seq_operations vehwaddr_seq_op = { + .start = vehwaddr_seq_start, + .next = vehwaddr_seq_next, + .stop = vehwaddr_seq_stop, + .show = vehwaddr_seq_show, +}; + +static int vehwaddr_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vehwaddr_seq_op); +} + +static struct file_operations proc_vehwaddr_operations = { + .open = vehwaddr_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + +int real_ve_hwaddr(envid_t veid, int op, + unsigned char *dev_addr, int addrlen, char *name, + unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve) +{ + int err; + struct ve_struct *ve; + char ve_addr[ETH_ALEN]; + + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto out; + + err = -EINVAL; + switch (op) { + case VE_ETH_ADD: + if (addrlen != ETH_ALEN) + goto out; + if (addrlen_ve != ETH_ALEN && addrlen_ve != 0) + goto out; + /* If ve addr is not set then we use dev_addr[3] & 0x80 for it */ + if (addrlen_ve == 0 && (dev_addr[3] & 0x80)) + goto out; + if (addrlen_ve == 0) { + memcpy(ve_addr, dev_addr, ETH_ALEN); + ve_addr[3] |= 0x80; + } else { + memcpy(ve_addr, dev_addr_ve, ETH_ALEN); + } + + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veth_entry_add(ve, dev_addr, name, ve_addr, name_ve); + up_read(&ve->op_sem); + put_ve(ve); + break; + + case VE_ETH_DEL: + if (name[0] == '\0') + goto out; + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = 
veth_entry_del(ve, name); + up_read(&ve->op_sem); + put_ve(ve); + break; + case VE_ETH_ALLOW_MAC_CHANGE: + case VE_ETH_DENY_MAC_CHANGE: + err = veth_allow_change_mac(veid, name, + op == VE_ETH_ALLOW_MAC_CHANGE); + break; + } + +out: + return err; +} + +int veth_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VETHCTL_VE_HWADDR: { + struct vzctl_ve_hwaddr s; + + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_ve_hwaddr(s.veid, s.op, s.dev_addr, s.addrlen, + s.dev_name, s.dev_addr_ve, s.addrlen_ve, + s.dev_name_ve); + } + break; + } + return err; +} + +static struct vzioctlinfo vethcalls = { + .type = VETHCTLTYPE, + .ioctl = veth_ioctl, + .compat_ioctl = veth_ioctl, + .owner = THIS_MODULE, +}; + +struct net_device * veth_dev_start(char *dev_addr, char *name) +{ + struct net_device *dev; + int err; + + if (!is_valid_ether_addr(dev_addr)) + return ERR_PTR(-EADDRNOTAVAIL); + + dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup); + if (!dev) + return ERR_PTR(-ENOMEM); + dev->nd_net = get_exec_env()->ve_netns; + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto err; + } + if ((err = register_netdev(dev)) != 0) + goto err; + + memcpy(dev->dev_addr, dev_addr, ETH_ALEN); + dev->addr_len = ETH_ALEN; + + return dev; +err: + free_netdev(dev); + printk(KERN_ERR "%s initialization error err=%d\n", name, err); + return ERR_PTR(err); +} + +static int veth_start(void *data) +{ + return 0; +} + +static void veth_stop(void *data) +{ + struct ve_struct *env; + struct veth_struct *entry, *tmp; + + env = (struct ve_struct *)data; + down(&hwaddr_sem); + list_for_each_entry_safe(entry, tmp, &veth_hwaddr_list, hwaddr_list) + if (VEID(env) == VEID(veth_to_netdev(entry)->owner_env)) + veth_pair_del(env, entry); + up(&hwaddr_sem); +} + +static struct ve_hook veth_ve_hook = { + .init = veth_start, + .fini = veth_stop, + .owner 
= THIS_MODULE, + .priority = HOOK_PRIO_NET, +}; + +__init int veth_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *de; + + de = proc_create("veth", S_IFREG|S_IRUSR, proc_vz_dir, + &proc_vehwaddr_operations); + if (de == NULL) + printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n"); +#endif + + register_netdev_rst(&veth_netdev_rst); + ve_hook_register(VE_SS_CHAIN, &veth_ve_hook); + vzioctl_register(&vethcalls); + return 0; +} + +__exit void veth_exit(void) +{ + struct veth_struct *entry; + struct list_head *tmp, *n; + struct ve_struct *ve; + + vzioctl_unregister(&vethcalls); + ve_hook_unregister(&veth_ve_hook); + unregister_netdev_rst(&veth_netdev_rst); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("veth", proc_vz_dir); +#endif + + down(&hwaddr_sem); + list_for_each_safe(tmp, n, &veth_hwaddr_list) { + entry = list_entry(tmp, struct veth_struct, hwaddr_list); + ve = get_ve(veth_to_netdev(entry)->owner_env); + + veth_pair_del(ve, entry); + + put_ve(ve); + } + up(&hwaddr_sem); +} + +module_init(veth_init); +module_exit(veth_exit); + +MODULE_AUTHOR("Andrey Mirkin "); +MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device"); +MODULE_LICENSE("GPL v2"); + diff -urNp linux-2.6.32.48/drivers/scsi/hosts.c linux-2.6.32.48-openvz/drivers/scsi/hosts.c --- linux-2.6.32.48/drivers/scsi/hosts.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/scsi/hosts.c 2011-11-21 17:40:45.000000000 -0500 @@ -401,9 +401,8 @@ struct Scsi_Host *scsi_host_alloc(struct device_initialize(&shost->shost_gendev); dev_set_name(&shost->shost_gendev, "host%d", shost->host_no); -#ifndef CONFIG_SYSFS_DEPRECATED - shost->shost_gendev.bus = &scsi_bus_type; -#endif + if (!sysfs_deprecated) + shost->shost_gendev.bus = &scsi_bus_type; shost->shost_gendev.type = &scsi_host_type; device_initialize(&shost->shost_dev); diff -urNp linux-2.6.32.48/drivers/scsi/scsi_scan.c linux-2.6.32.48-openvz/drivers/scsi/scsi_scan.c --- linux-2.6.32.48/drivers/scsi/scsi_scan.c 2011-11-08 
19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/drivers/scsi/scsi_scan.c 2011-11-21 17:40:45.000000000 -0500 @@ -415,9 +415,8 @@ static struct scsi_target *scsi_alloc_ta starget->reap_ref = 1; dev->parent = get_device(parent); dev_set_name(dev, "target%d:%d:%d", shost->host_no, channel, id); -#ifndef CONFIG_SYSFS_DEPRECATED - dev->bus = &scsi_bus_type; -#endif + if (!sysfs_deprecated) + dev->bus = &scsi_bus_type; dev->type = &scsi_target_type; starget->id = id; starget->channel = channel; diff -urNp linux-2.6.32.48/fs/aio.c linux-2.6.32.48-openvz/fs/aio.c --- linux-2.6.32.48/fs/aio.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/aio.c 2011-11-21 17:40:45.000000000 -0500 @@ -43,13 +43,16 @@ #endif /*------ sysctl variables----*/ -static DEFINE_SPINLOCK(aio_nr_lock); +DEFINE_SPINLOCK(aio_nr_lock); +EXPORT_SYMBOL_GPL(aio_nr_lock); unsigned long aio_nr; /* current system wide number of aio requests */ +EXPORT_SYMBOL_GPL(aio_nr); unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ static struct kmem_cache *kiocb_cachep; -static struct kmem_cache *kioctx_cachep; +struct kmem_cache *kioctx_cachep; +EXPORT_SYMBOL_GPL(kioctx_cachep); static struct workqueue_struct *aio_wq; @@ -60,7 +63,7 @@ static DECLARE_WORK(fput_work, aio_fput_ static DEFINE_SPINLOCK(fput_lock); static LIST_HEAD(fput_head); -static void aio_kick_handler(struct work_struct *); +void aio_kick_handler(struct work_struct *); static void aio_queue_work(struct kioctx *); /* aio_setup @@ -343,7 +346,7 @@ static void aio_cancel_all(struct kioctx spin_unlock_irq(&ctx->ctx_lock); } -static void wait_for_all_aios(struct kioctx *ctx) +void wait_for_all_aios(struct kioctx *ctx) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -366,6 +369,7 @@ static void wait_for_all_aios(struct kio out: spin_unlock_irq(&ctx->ctx_lock); } +EXPORT_SYMBOL_GPL(wait_for_all_aios); /* wait_on_sync_kiocb: * Waits on the given sync 
kiocb to complete. @@ -818,7 +822,7 @@ static inline void aio_run_all_iocbs(str * space. * Run on aiod's context. */ -static void aio_kick_handler(struct work_struct *work) +void aio_kick_handler(struct work_struct *work) { struct kioctx *ctx = container_of(work, struct kioctx, wq.work); mm_segment_t oldfs = get_fs(); @@ -839,7 +843,7 @@ static void aio_kick_handler(struct work if (requeue) queue_delayed_work(aio_wq, &ctx->wq, 0); } - +EXPORT_SYMBOL_GPL(aio_kick_handler); /* * Called by kick_iocb to queue the kiocb for retry diff -urNp linux-2.6.32.48/fs/autofs/init.c linux-2.6.32.48-openvz/fs/autofs/init.c --- linux-2.6.32.48/fs/autofs/init.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/autofs/init.c 2011-11-21 17:40:45.000000000 -0500 @@ -25,6 +25,7 @@ static struct file_system_type autofs_fs .name = "autofs", .get_sb = autofs_get_sb, .kill_sb = autofs_kill_sb, + .fs_flags = FS_VIRTUALIZED, }; static int __init init_autofs_fs(void) diff -urNp linux-2.6.32.48/fs/autofs/inode.c linux-2.6.32.48-openvz/fs/autofs/inode.c --- linux-2.6.32.48/fs/autofs/inode.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/autofs/inode.c 2011-11-21 17:40:45.000000000 -0500 @@ -78,7 +78,7 @@ static int parse_options(char *options, *uid = current_uid(); *gid = current_gid(); - *pgrp = task_pgrp_nr(current); + *pgrp = task_pgrp_vnr(current); *minproto = *maxproto = AUTOFS_PROTO_VERSION; diff -urNp linux-2.6.32.48/fs/autofs/root.c linux-2.6.32.48-openvz/fs/autofs/root.c --- linux-2.6.32.48/fs/autofs/root.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/autofs/root.c 2011-11-21 17:40:45.000000000 -0500 @@ -362,7 +362,7 @@ static int autofs_root_unlink(struct ino /* This allows root to remove symlinks */ lock_kernel(); - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) { + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) { unlock_kernel(); return -EACCES; } @@ -556,7 +556,7 @@ static int 
autofs_root_ioctl(struct inod _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) return -EPERM; switch(cmd) { diff -urNp linux-2.6.32.48/fs/autofs4/autofs_i.h linux-2.6.32.48-openvz/fs/autofs4/autofs_i.h --- linux-2.6.32.48/fs/autofs4/autofs_i.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/autofs4/autofs_i.h 2011-11-21 17:40:45.000000000 -0500 @@ -119,7 +119,7 @@ struct autofs_sb_info { u32 magic; int pipefd; struct file *pipe; - pid_t oz_pgrp; + struct pid *oz_pgrp; int catatonic; int version; int sub_version; @@ -136,6 +136,7 @@ struct autofs_sb_info { spinlock_t lookup_lock; struct list_head active_list; struct list_head expiring_list; + unsigned is32bit:1; }; static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) @@ -153,7 +154,7 @@ static inline struct autofs_info *autofs filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? 
*/ diff -urNp linux-2.6.32.48/fs/autofs4/dev-ioctl.c linux-2.6.32.48-openvz/fs/autofs4/dev-ioctl.c --- linux-2.6.32.48/fs/autofs4/dev-ioctl.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/autofs4/dev-ioctl.c 2011-11-21 17:40:45.000000000 -0500 @@ -385,7 +385,8 @@ static int autofs_dev_ioctl_setpipefd(st fput(pipe); goto out; } - sbi->oz_pgrp = task_pgrp_nr(current); + put_pid(sbi->oz_pgrp); + sbi->oz_pgrp = get_pid(task_pgrp(current)); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; diff -urNp linux-2.6.32.48/fs/autofs4/init.c linux-2.6.32.48-openvz/fs/autofs4/init.c --- linux-2.6.32.48/fs/autofs4/init.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/autofs4/init.c 2011-11-21 17:40:45.000000000 -0500 @@ -25,6 +25,7 @@ static struct file_system_type autofs_fs .name = "autofs", .get_sb = autofs_get_sb, .kill_sb = autofs4_kill_sb, + .fs_flags = FS_VIRTUALIZED, }; static int __init init_autofs4_fs(void) diff -urNp linux-2.6.32.48/fs/autofs4/inode.c linux-2.6.32.48-openvz/fs/autofs4/inode.c --- linux-2.6.32.48/fs/autofs4/inode.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/autofs4/inode.c 2011-11-21 17:40:45.000000000 -0500 @@ -171,6 +171,8 @@ void autofs4_kill_sb(struct super_block /* Clean up and release dangling references */ autofs4_force_release(sbi); + put_pid(sbi->oz_pgrp); + sb->s_fs_info = NULL; kfree(sbi); @@ -192,7 +194,7 @@ static int autofs4_show_options(struct s seq_printf(m, ",uid=%u", root_inode->i_uid); if (root_inode->i_gid != 0) seq_printf(m, ",gid=%u", root_inode->i_gid); - seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); @@ -237,7 +239,7 @@ static int parse_options(char *options, *uid = current_uid(); *gid = current_gid(); - *pgrp = task_pgrp_nr(current); + *pgrp = task_pgrp_vnr(current); 
*minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -322,6 +324,7 @@ int autofs4_fill_super(struct super_bloc int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; + pid_t pgrp; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -334,13 +337,16 @@ int autofs4_fill_super(struct super_bloc sbi->pipe = NULL; sbi->catatonic = 1; sbi->exp_timeout = 0; - sbi->oz_pgrp = task_pgrp_nr(current); sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; set_autofs_type_indirect(&sbi->type); sbi->min_proto = 0; sbi->max_proto = 0; +#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + sbi->is32bit = 1; +#endif mutex_init(&sbi->wq_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; @@ -373,7 +379,7 @@ int autofs4_fill_super(struct super_bloc /* Can this call block? */ if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, + &pgrp, &sbi->type, &sbi->min_proto, &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; @@ -401,12 +407,20 @@ int autofs4_fill_super(struct super_bloc sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pgrp); + + sbi->oz_pgrp = find_get_pid(pgrp); + + if (!sbi->oz_pgrp) { + printk("autofs: could not find process group %d\n", pgrp); + goto fail_dput; + } + pipe = fget(pipefd); if (!pipe) { printk("autofs: could not open pipe file descriptor\n"); - goto fail_dput; + goto fail_put_pid; } if (!pipe->f_op || !pipe->f_op->write) goto fail_fput; @@ -427,6 +441,8 @@ fail_fput: printk("autofs: pipe file descriptor does not contain proper ops\n"); fput(pipe); /* fall through */ +fail_put_pid: + put_pid(sbi->oz_pgrp); fail_dput: dput(root); goto fail_free; diff -urNp linux-2.6.32.48/fs/autofs4/root.c linux-2.6.32.48-openvz/fs/autofs4/root.c --- 
linux-2.6.32.48/fs/autofs4/root.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/autofs4/root.c 2011-11-21 17:40:45.000000000 -0500 @@ -689,7 +689,7 @@ static int autofs4_dir_unlink(struct ino struct autofs_info *p_ino; /* This allows root to remove symlinks */ - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) return -EACCES; if (atomic_dec_and_test(&ino->count)) { @@ -883,7 +883,7 @@ static int autofs4_root_ioctl(struct ino _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) return -EPERM; switch(cmd) { diff -urNp linux-2.6.32.48/fs/autofs4/waitq.c linux-2.6.32.48-openvz/fs/autofs4/waitq.c --- linux-2.6.32.48/fs/autofs4/waitq.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/autofs4/waitq.c 2011-11-21 17:40:45.000000000 -0500 @@ -148,6 +148,16 @@ static void autofs4_notify_daemon(struct struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; pktsz = sizeof(*packet); +#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION + /* + * On x86_64, struct autofs_v5_packet is padded with 4 extra bytes, + * which breaks an autofs daemon running in ia32 emulation mode. + * + * Reduce the size in 32-bit mode to match what userspace expects. + */ + if (sbi->is32bit) + pktsz -= 4; +#endif packet->wait_queue_token = wq->wait_queue_token; packet->len = wq->name.len; diff -urNp linux-2.6.32.48/fs/binfmt_aout.c linux-2.6.32.48-openvz/fs/binfmt_aout.c --- linux-2.6.32.48/fs/binfmt_aout.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/binfmt_aout.c 2011-11-21 17:40:45.000000000 -0500 @@ -311,12 +311,12 @@ static int load_aout_binary(struct linux if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) { - printk(KERN_NOTICE "executable not page
aligned\n"); + ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n"); } if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) { - printk(KERN_WARNING + ve_printk(VE_LOG, KERN_WARNING "fd_offset is not page aligned. Please convert program: %s\n", bprm->file->f_path.dentry->d_name.name); } @@ -425,7 +425,7 @@ static int load_aout_library(struct file if (printk_ratelimit()) { - printk(KERN_WARNING + ve_printk(VE_LOG, KERN_WARNING "N_TXTOFF is not page aligned. Please convert library: %s\n", file->f_path.dentry->d_name.name); } diff -urNp linux-2.6.32.48/fs/binfmt_elf.c linux-2.6.32.48-openvz/fs/binfmt_elf.c --- linux-2.6.32.48/fs/binfmt_elf.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/binfmt_elf.c 2011-11-21 17:40:45.000000000 -0500 @@ -437,7 +437,7 @@ static unsigned long load_elf_interp(str eppnt = elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { - int elf_type = MAP_PRIVATE | MAP_DENYWRITE; + int elf_type = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECPRIO; int elf_prot = 0; unsigned long vaddr = 0; unsigned long k, map_addr; @@ -789,7 +789,8 @@ static int load_elf_binary(struct linux_ if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; - elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + elf_flags = MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_EXECPRIO; vaddr = elf_ppnt->p_vaddr; if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { @@ -922,7 +923,7 @@ static int load_elf_binary(struct linux_ set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = arch_setup_additional_pages(bprm, !!elf_interpreter); + retval = arch_setup_additional_pages(bprm, !!elf_interpreter, 0); if (retval < 0) { send_sig(SIGKILL, current, 0); goto out; diff -urNp linux-2.6.32.48/fs/binfmt_misc.c linux-2.6.32.48-openvz/fs/binfmt_misc.c --- linux-2.6.32.48/fs/binfmt_misc.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/binfmt_misc.c 2011-11-21 17:40:45.000000000 
-0500 @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -35,8 +36,15 @@ enum { VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ }; +#ifdef CONFIG_VE +#define bm_entries(ve) ((ve)->bm_entries) +#define bm_enabled(ve) ((ve)->bm_enabled) +#else static LIST_HEAD(entries); static int enabled = 1; +#define bm_entries(ve) (entries) +#define bm_enabled(ve) (enabled) +#endif enum {Enabled, Magic}; #define MISC_FMT_PRESERVE_ARGV0 (1<<31) @@ -56,21 +64,30 @@ typedef struct { } Node; static DEFINE_RWLOCK(entries_lock); +#ifdef CONFIG_VE +#define bm_fs_type(ve) (*(ve)->bm_fs_type) +#define bm_mnt(ve) ((ve)->bm_mnt) +#define bm_entry_count(ve) ((ve)->bm_entry_count) +#else static struct file_system_type bm_fs_type; static struct vfsmount *bm_mnt; static int entry_count; +#define bm_fs_type(ve) (bm_fs_type) +#define bm_mnt(ve) (bm_mnt) +#define bm_entry_count(ve) (entry_count) /* non-VE: the file-scope entry_count */ +#endif /* * Check if we support the binfmt * if we do, return the node, else NULL * locking is done in load_misc_binary */ -static Node *check_file(struct linux_binprm *bprm) +static Node *check_file(struct ve_struct *ve, struct linux_binprm *bprm) { char *p = strrchr(bprm->interp, '.'); struct list_head *l; - list_for_each(l, &entries) { + list_for_each(l, &bm_entries(ve)) { Node *e = list_entry(l, Node, list); char *s; int j; @@ -111,9 +128,10 @@ static int load_misc_binary(struct linux char *iname_addr = iname; int retval; int fd_binary = -1; + struct ve_struct *ve = get_exec_env(); retval = -ENOEXEC; - if (!enabled) + if (!bm_enabled(ve)) goto _ret; retval = -ENOEXEC; @@ -122,7 +140,7 @@ static int load_misc_binary(struct linux /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); - fmt = check_file(bprm); + fmt = check_file(ve, bprm); if (fmt) strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); read_unlock(&entries_lock); @@ -507,7 +525,7 @@ static void bm_clear_inode(struct inode kfree(inode->i_private); } -static void
kill_node(Node *e) +static void kill_node(struct ve_struct *ve, Node *e) { struct dentry *dentry; @@ -523,7 +541,7 @@ static void kill_node(Node *e) dentry->d_inode->i_nlink--; d_drop(dentry); dput(dentry); - simple_release_fs(&bm_mnt, &entry_count); + simple_release_fs(&bm_mnt(ve), &bm_entry_count(ve)); } } @@ -562,7 +580,7 @@ static ssize_t bm_entry_write(struct fil case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); mutex_lock(&root->d_inode->i_mutex); - kill_node(e); + kill_node(get_exec_env(), e); mutex_unlock(&root->d_inode->i_mutex); dput(root); @@ -587,6 +605,7 @@ static ssize_t bm_register_write(struct struct dentry *root, *dentry; struct super_block *sb = file->f_path.mnt->mnt_sb; int err = 0; + struct ve_struct *ve = get_exec_env(); e = create_entry(buffer, count); @@ -610,7 +629,7 @@ static ssize_t bm_register_write(struct if (!inode) goto out2; - err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count); + err = simple_pin_fs(&bm_fs_type(ve), &bm_mnt(ve), &bm_entry_count(ve)); if (err) { iput(inode); inode = NULL; @@ -623,7 +642,7 @@ static ssize_t bm_register_write(struct d_instantiate(dentry, inode); write_lock(&entries_lock); - list_add(&e->list, &entries); + list_add(&e->list, &bm_entries(ve)); write_unlock(&entries_lock); err = 0; @@ -649,26 +668,31 @@ static const struct file_operations bm_r static ssize_t bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - char *s = enabled ? "enabled\n" : "disabled\n"; + struct ve_struct *ve = get_exec_env(); + char *s = bm_enabled(ve) ? 
"enabled\n" : "disabled\n"; return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } +static void dm_genocide(struct ve_struct *ve) +{ + while (!list_empty(&bm_entries(ve))) + kill_node(ve, list_entry(bm_entries(ve).next, Node, list)); +} + static ssize_t bm_status_write(struct file * file, const char __user * buffer, size_t count, loff_t *ppos) { + struct ve_struct *ve = get_exec_env(); int res = parse_command(buffer, count); struct dentry *root; switch (res) { - case 1: enabled = 0; break; - case 2: enabled = 1; break; + case 1: bm_enabled(ve) = 0; break; + case 2: bm_enabled(ve) = 1; break; case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); mutex_lock(&root->d_inode->i_mutex); - - while (!list_empty(&entries)) - kill_node(list_entry(entries.next, Node, list)); - + dm_genocide(ve); mutex_unlock(&root->d_inode->i_mutex); dput(root); default: return res; @@ -719,6 +743,53 @@ static struct file_system_type bm_fs_typ .kill_sb = kill_litter_super, }; +#ifdef CONFIG_VE +static void __ve_binfmt_init(struct ve_struct *ve, struct file_system_type *fs) +{ + ve->bm_fs_type = fs; + INIT_LIST_HEAD(&ve->bm_entries); + ve->bm_enabled = 1; + ve->bm_mnt = NULL; + ve->bm_entry_count = 0; +} + +static int ve_binfmt_init(void *x) +{ + struct ve_struct *ve = x; + struct file_system_type *fs_type; + int err; + + err = register_ve_fs_type(ve, &bm_fs_type, &fs_type, NULL); + if (err == 0) + __ve_binfmt_init(ve, fs_type); + + return err; +} + +static void ve_binfmt_fini(void *x) +{ + struct ve_struct *ve = x; + + /* + * no locks since exec_ve is dead and noone will + * mess with bm_xxx fields any longer + */ + if (!ve->bm_fs_type) + return; + dm_genocide(ve); + unregister_ve_fs_type(ve->bm_fs_type, NULL); + kfree(ve->bm_fs_type); + ve->bm_fs_type = NULL; +} + +static struct ve_hook ve_binfmt_hook = { + .init = ve_binfmt_init, + .fini = ve_binfmt_fini, + .priority = HOOK_PRIO_FS, + .owner = THIS_MODULE, +}; +#endif + static int __init init_misc_binfmt(void) { int err = 
register_filesystem(&bm_fs_type); @@ -727,11 +798,17 @@ static int __init init_misc_binfmt(void) if (err) unregister_filesystem(&bm_fs_type); } + + if (!err) { + __ve_binfmt_init(get_ve0(), &bm_fs_type); + ve_hook_register(VE_SS_CHAIN, &ve_binfmt_hook); + } return err; } static void __exit exit_misc_binfmt(void) { + ve_hook_unregister(&ve_binfmt_hook); unregister_binfmt(&misc_format); unregister_filesystem(&bm_fs_type); } diff -urNp linux-2.6.32.48/fs/block_dev.c linux-2.6.32.48-openvz/fs/block_dev.c --- linux-2.6.32.48/fs/block_dev.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/block_dev.c 2011-11-21 17:40:45.000000000 -0500 @@ -1604,7 +1604,7 @@ int __invalidate_device(struct block_dev * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb); + res = invalidate_inodes_check(sb, 1); drop_super(sb); } invalidate_bdev(bdev); diff -urNp linux-2.6.32.48/fs/buffer.c linux-2.6.32.48-openvz/fs/buffer.c --- linux-2.6.32.48/fs/buffer.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/buffer.c 2011-11-21 17:40:45.000000000 -0500 @@ -671,14 +671,18 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { + int acct = 0; + spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? 
*/ WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); + acct = account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&mapping->tree_lock); + if (acct) + task_io_account_write(page, PAGE_CACHE_SIZE, 0); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff -urNp linux-2.6.32.48/fs/compat.c linux-2.6.32.48-openvz/fs/compat.c --- linux-2.6.32.48/fs/compat.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/compat.c 2011-11-21 17:40:45.000000000 -0500 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,18 @@ int compat_printk(const char *fmt, ...) #include "read_write.h" +int ve_compat_printk(int dst, const char *fmt, ...) +{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = ve_vprintk(dst, fmt, ap); + va_end(ap); + return ret; +} + /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. @@ -91,6 +104,21 @@ asmlinkage long compat_sys_utime(char __ return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } +asmlinkage long compat_sys_lutime(char __user * filename, + struct compat_utimbuf __user *t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t->actime) || + get_user(tv[1].tv_sec, &t->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, t ? 
tv : NULL, AT_SYMLINK_NOFOLLOW); +} + asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags) { struct timespec tv[2]; @@ -269,6 +297,8 @@ asmlinkage long compat_sys_statfs(const struct kstatfs tmp; error = vfs_statfs(path.dentry, &tmp); if (!error) + error = faudit_statfs(path.mnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs(buf, &tmp); path_put(&path); } @@ -287,6 +317,8 @@ asmlinkage long compat_sys_fstatfs(unsig goto out; error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) + error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs(buf, &tmp); fput(file); out: @@ -337,6 +369,8 @@ asmlinkage long compat_sys_statfs64(cons struct kstatfs tmp; error = vfs_statfs(path.dentry, &tmp); if (!error) + error = faudit_statfs(path.mnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs64(buf, &tmp); path_put(&path); } @@ -358,6 +392,8 @@ asmlinkage long compat_sys_fstatfs64(uns goto out; error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) + error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs64(buf, &tmp); fput(file); out: @@ -1469,6 +1505,10 @@ int compat_do_execve(char * filename, bool clear_in_exec; int retval; + retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); + if (retval) + return retval; + retval = unshare_files(&displaced); if (retval) goto out_ret; diff -urNp linux-2.6.32.48/fs/compat_ioctl.c linux-2.6.32.48-openvz/fs/compat_ioctl.c --- linux-2.6.32.48/fs/compat_ioctl.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/compat_ioctl.c 2011-11-21 17:40:45.000000000 -0500 @@ -2753,7 +2753,7 @@ static void compat_ioctl_error(struct fi sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK); if (!isprint(buf[1])) sprintf(buf, "%02x", buf[1]); - compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " + ve_compat_printk(VE_LOG, "ioctl32(%s:%d): Unknown cmd fd(%d) " 
"cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n", current->comm, current->pid, (int)fd, (unsigned int)cmd, buf, diff -urNp linux-2.6.32.48/fs/dcache.c linux-2.6.32.48-openvz/fs/dcache.c --- linux-2.6.32.48/fs/dcache.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/dcache.c 2011-11-21 17:40:45.000000000 -0500 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +34,15 @@ #include #include #include +#include +#include +#include +#include #include "internal.h" +#include +#include + int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); @@ -43,7 +51,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOC EXPORT_SYMBOL(dcache_lock); -static struct kmem_cache *dentry_cache __read_mostly; +struct kmem_cache *dentry_cache __read_mostly; #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) @@ -173,6 +181,7 @@ static struct dentry *d_kill(struct dent list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ + preempt_enable_no_resched(); /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); if (IS_ROOT(dentry)) @@ -220,15 +229,22 @@ void dput(struct dentry *dentry) repeat: if (atomic_read(&dentry->d_count) == 1) might_sleep(); - if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) - return; + preempt_disable(); + if (unlikely(ub_dentry_on)) { + spin_lock(&dcache_lock); + if (!atomic_dec_and_test(&dentry->d_count)) { + ub_dentry_uncharge_locked(dentry); + spin_unlock(&dcache_lock); + goto out_preempt; + } + } else { + if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) + goto out_preempt; + } spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - return; - } + if (atomic_read(&dentry->d_count)) + goto out_unlock; /* * AV: ->d_delete() is _NOT_ allowed to block now. 
@@ -244,8 +260,12 @@ repeat: dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); } +out_unlock: spin_unlock(&dentry->d_lock); + ub_dentry_uncharge_locked(dentry); spin_unlock(&dcache_lock); +out_preempt: + preempt_enable(); return; unhash_it: @@ -253,9 +273,21 @@ unhash_it: kill_it: /* if dentry was on the d_lru list delete it from there */ dentry_lru_del(dentry); + + if (unlikely(ub_dentry_on)) { + struct user_beancounter *ub; + + ub = dentry->dentry_bc.d_ub; + BUG_ON(!ub_dput_testzero(dentry)); + uncharge_dcache(ub, dentry->dentry_bc.d_ubsize); + put_beancounter(ub); + } + dentry = d_kill(dentry); - if (dentry) + preempt_disable(); + if (dentry) goto repeat; + preempt_enable(); } /** @@ -321,6 +353,7 @@ static inline struct dentry * __dget_loc { atomic_inc(&dentry->d_count); dentry_lru_del_init(dentry); + ub_dentry_charge_nofail(dentry); return dentry; } @@ -423,6 +456,7 @@ static void prune_one_dentry(struct dent __acquires(dcache_lock) { __d_drop(dentry); + preempt_disable(); dentry = d_kill(dentry); /* @@ -438,6 +472,7 @@ static void prune_one_dentry(struct dent dentry->d_op->d_delete(dentry); dentry_lru_del_init(dentry); __d_drop(dentry); + preempt_disable(); dentry = d_kill(dentry); spin_lock(&dcache_lock); } @@ -732,6 +767,8 @@ void shrink_dcache_for_umount(struct sup dentry = sb->s_root; sb->s_root = NULL; + /* "/" was also charged in d_alloc_root() */ + ub_dentry_uncharge(dentry); atomic_dec(&dentry->d_count); shrink_dcache_for_umount_subtree(dentry); @@ -891,12 +928,18 @@ void shrink_dcache_parent(struct dentry */ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) { + int res = -1; + + KSTAT_PERF_ENTER(shrink_dcache) if (nr) { if (!(gfp_mask & __GFP_FS)) - return -1; + goto out; prune_dcache(nr); } - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; +out: + KSTAT_PERF_LEAVE(shrink_dcache) + return res; } static struct shrinker dcache_shrinker = { @@ -919,21 
+962,27 @@ struct dentry *d_alloc(struct dentry * p struct dentry *dentry; char *dname; + dname = NULL; + if (name->len > DNAME_INLINE_LEN-1) { + dname = kmalloc(name->len + 1, GFP_KERNEL); + if (!dname) + goto err_name; + } + + ub_dentry_alloc_start(); + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) - return NULL; + goto err_alloc; - if (name->len > DNAME_INLINE_LEN-1) { - dname = kmalloc(name->len + 1, GFP_KERNEL); - if (!dname) { - kmem_cache_free(dentry_cache, dentry); - return NULL; - } - } else { + preempt_disable(); + if (dname == NULL) dname = dentry->d_iname; - } dentry->d_name.name = dname; + if (ub_dentry_alloc(dentry)) + goto err_charge; + dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); @@ -961,12 +1010,27 @@ struct dentry *d_alloc(struct dentry * p } spin_lock(&dcache_lock); - if (parent) + if (parent) { list_add(&dentry->d_u.d_child, &parent->d_subdirs); + if (parent->d_flags & DCACHE_VIRTUAL) + dentry->d_flags |= DCACHE_VIRTUAL; + } dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); + preempt_enable(); + ub_dentry_alloc_end(); return dentry; + +err_charge: + preempt_enable(); + kmem_cache_free(dentry_cache, dentry); +err_alloc: + if (name->len > DNAME_INLINE_LEN - 1) + kfree(dname); + ub_dentry_alloc_end(); +err_name: + return NULL; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) @@ -1366,12 +1430,12 @@ struct dentry * __d_lookup(struct dentry unsigned int hash = name->hash; const unsigned char *str = name->name; struct hlist_head *head = d_hash(parent,hash); - struct dentry *found = NULL; struct hlist_node *node; - struct dentry *dentry; + struct dentry *dentry, *found; rcu_read_lock(); + found = NULL; hlist_for_each_entry_rcu(dentry, node, head, d_hash) { struct qstr *qstr; @@ -1411,6 +1475,10 @@ struct dentry * __d_lookup(struct dentry atomic_inc(&dentry->d_count); found = dentry; + + if (ub_dentry_charge(found)) + goto charge_failure; + 
spin_unlock(&dentry->d_lock); break; next: @@ -1419,6 +1487,14 @@ next: rcu_read_unlock(); return found; + +charge_failure: + spin_unlock(&found->d_lock); + rcu_read_unlock(); + /* dentry is now unhashed, just kill it */ + dput(found); + /* ... and fail lookup */ + return NULL; } /** @@ -1887,6 +1963,16 @@ static int prepend_name(char **buffer, i } /** + * d_root_check - checks if dentry is accessible from current's fs root + * @path: dentry/vfsmount pair to be verified; d_path() is run on it with + * a NULL buffer and its result is returned through PTR_ERR() + */ +int d_root_check(struct path *path) +{ + return PTR_ERR(d_path(path, NULL, 0)); +} + +/** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry (may be modified by this function) @@ -1911,18 +1997,21 @@ char *__d_path(const struct path *path, struct vfsmount *vfsmnt = path->mnt; char *end = buffer + buflen; char *retval; + int deleted; + struct vfsmount *oldmnt = vfsmnt; spin_lock(&vfsmount_lock); - prepend(&end, &buflen, "\0", 1); - if (d_unlinked(dentry) && - (prepend(&end, &buflen, " (deleted)", 10) != 0)) + if (buffer) { + prepend(&end, &buflen, "\0", 1); + if (buflen < 1) goto Elong; + } + deleted = (!IS_ROOT(dentry) && d_unhashed(dentry)); - if (buflen < 1) - goto Elong; /* Get '/' right */ retval = end-1; - *retval = '/'; + if (buffer) + *retval = '/'; for (;;) { struct dentry * parent; @@ -1940,20 +2029,43 @@ char *__d_path(const struct path *path, } parent = dentry->d_parent; prefetch(parent); - if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || - (prepend(&end, &buflen, "/", 1) != 0)) + if (buffer && ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || + (prepend(&end, &buflen, "/", 1) != 0))) goto Elong; retval = end; dentry = parent; } out: + if (deleted && buffer && + prepend(&end, &buflen, " (deleted)", 10) != 0) + goto Elong; + spin_unlock(&vfsmount_lock); - return retval; + return buffer ?
retval : NULL; global_root: + /* + * We traversed the tree upward and reached a root, but the given + * lookup terminal point wasn't encountered. It means either that the + * dentry is out of our scope or belongs to an abstract space like + * sock_mnt or pipe_mnt. Check for it. + * + * There are different options to check it. + * We may assume that any dentry tree is unreachable unless it's + * connected to `root' (defined as fs root of init aka child reaper) + * and expose all paths that are not connected to it. + * The other option is to allow exposing of known abstract spaces + * explicitly and hide the path information for other cases. + * This approach is more safe, let's take it. 2001/04/22 SAW + */ + if (!(oldmnt->mnt_sb->s_flags & MS_NOUSER)) { + retval = ERR_PTR(-EINVAL); + goto out_err; + } + retval += 1; /* hit the slash */ - if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) + if (buffer && prepend_name(&retval, &buflen, &dentry->d_name) != 0) goto Elong; root->mnt = vfsmnt; root->dentry = dentry; @@ -1961,8 +2073,12 @@ global_root: Elong: retval = ERR_PTR(-ENAMETOOLONG); - goto out; +out_err: + spin_unlock(&vfsmount_lock); + return retval; + } +EXPORT_SYMBOL(__d_path); /** * d_path - return the path of a dentry @@ -1992,8 +2108,11 @@ char *d_path(const struct path *path, ch * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: + * + * pipefs and socketfs methods assume valid buffer, d_root_check() + * supplies NULL one for access checks. 
*/ - if (path->dentry->d_op && path->dentry->d_op->d_dname) + if (buf && path->dentry->d_op && path->dentry->d_op->d_dname) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); read_lock(&current->fs->lock); @@ -2008,6 +2127,231 @@ char *d_path(const struct path *path, ch return res; } +#ifdef CONFIG_VE +#include +#include +#include +#include +#include + +static void mark_sub_tree_virtual(struct dentry *d) +{ + struct dentry *orig_root; + + orig_root = d; + while (1) { + spin_lock(&d->d_lock); + d->d_flags |= DCACHE_VIRTUAL; + spin_unlock(&d->d_lock); + + if (!list_empty(&d->d_subdirs)) { + d = list_entry(d->d_subdirs.next, + struct dentry, d_u.d_child); + continue; + } + if (d == orig_root) + break; + while (d == list_entry(d->d_parent->d_subdirs.prev, + struct dentry, d_u.d_child)) { + d = d->d_parent; + if (d == orig_root) + goto out; + } + d = list_entry(d->d_u.d_child.next, + struct dentry, d_u.d_child); + } +out: + return; +} + +void mark_tree_virtual(struct path *path) +{ + struct vfsmount *orig_rootmnt; + struct vfsmount *m = path->mnt; + struct dentry *d = path->dentry; + + spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); + orig_rootmnt = m; + while (1) { + mark_sub_tree_virtual(d); + if (!list_empty(&m->mnt_mounts)) { + m = list_entry(m->mnt_mounts.next, + struct vfsmount, mnt_child); + d = m->mnt_root; + continue; + } + if (m == orig_rootmnt) + break; + while (m == list_entry(m->mnt_parent->mnt_mounts.prev, + struct vfsmount, mnt_child)) { + m = m->mnt_parent; + if (m == orig_rootmnt) + goto out; + } + m = list_entry(m->mnt_child.next, + struct vfsmount, mnt_child); + d = m->mnt_root; + } +out: + spin_unlock(&vfsmount_lock); + spin_unlock(&dcache_lock); +} +EXPORT_SYMBOL(mark_tree_virtual); + +static struct vz_rate_info area_ri = { 20, 10*HZ }; +#define VE_AREA_ACC_CHECK 0x0001 +#define VE_AREA_ACC_DENY 0x0002 +#define VE_AREA_EXEC_CHECK 0x0010 +#define VE_AREA_EXEC_DENY 0x0020 +#define VE0_AREA_ACC_CHECK 0x0100 +#define VE0_AREA_ACC_DENY 0x0200
+#define VE0_AREA_EXEC_CHECK 0x1000 +#define VE0_AREA_EXEC_DENY 0x2000 +int ve_area_access_check = 0; + +static void print_connection_info(struct task_struct *tsk) +{ + struct files_struct *files; + struct fdtable *fdt; + int fd; + + files = get_files_struct(tsk); + if (!files) + return; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + for (fd = 0; fd < fdt->max_fds; fd++) { + struct file *file; + struct inode *inode; + struct socket *socket; + struct sock *sk; + struct inet_sock *inet; + + file = fdt->fd[fd]; + if (file == NULL) + continue; + + inode = file->f_dentry->d_inode; + if (!S_ISSOCK(inode->i_mode)) + continue; + + socket = SOCKET_I(inode); + if (socket == NULL) + continue; + + sk = socket->sk; + if ((sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + || sk->sk_type != SOCK_STREAM) + continue; + + inet = inet_sk(sk); + printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n", + NIPQUAD(inet->daddr), ntohs(inet->dport), + inet->num); + } + spin_unlock(&files->file_lock); + put_files_struct(files); +} + +static void check_alert(struct path *path, char *str) +{ + struct task_struct *tsk; + unsigned long page; + struct super_block *sb; + char *p; + + if (!vz_ratelimit(&area_ri)) + return; + + tsk = current; + p = ERR_PTR(-ENOMEM); + page = __get_free_page(GFP_KERNEL); + if (page) { + spin_lock(&dcache_lock); + p = __d_path(path, &tsk->fs->root, (char *)page, PAGE_SIZE); + spin_unlock(&dcache_lock); + } + if (IS_ERR(p)) + p = "(undefined)"; + + sb = path->dentry->d_sb; + printk(KERN_ALERT "%s check alert! 
file:[%s] from %d/%s, dev%x\n" + "Task %d/%d[%s] from VE%d, execenv %d\n", + str, p, sb->s_type->owner_env->veid, + sb->s_type->name, sb->s_dev, + tsk->pid, task_pid_vnr(tsk), tsk->comm, + VE_TASK_INFO(tsk)->owner_env->veid, + get_exec_env()->veid); + + free_page(page); + + print_connection_info(tsk); + + read_lock(&tasklist_lock); + tsk = tsk->parent; + get_task_struct(tsk); + read_unlock(&tasklist_lock); + + printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n", + tsk->pid, task_pid_vnr(tsk), tsk->comm, + VE_TASK_INFO(tsk)->owner_env->veid); + + print_connection_info(tsk); + put_task_struct(tsk); + dump_stack(); +} +#endif + +int check_area_access_ve(struct path *path) +{ +#ifdef CONFIG_VE + int check, alert, deny; + + if (ve_is_super(get_exec_env())) { + check = ve_area_access_check & VE0_AREA_ACC_CHECK; + alert = path->dentry->d_flags & DCACHE_VIRTUAL; + deny = ve_area_access_check & VE0_AREA_ACC_DENY; + } else { + check = ve_area_access_check & VE_AREA_ACC_CHECK; + alert = !(path->dentry->d_flags & DCACHE_VIRTUAL); + deny = ve_area_access_check & VE_AREA_ACC_DENY; + } + + if (check && alert) + check_alert(path, "Access"); + if (deny && alert) + return -EACCES; +#endif + return 0; +} + +#if 0 +int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt) +{ +#ifdef CONFIG_VE + int check, alert, deny; + + if (ve_is_super(get_exec_env())) { + check = ve_area_access_check & VE0_AREA_EXEC_CHECK; + alert = dentry->d_flags & DCACHE_VIRTUAL; + deny = ve_area_access_check & VE0_AREA_EXEC_DENY; + } else { + check = ve_area_access_check & VE_AREA_EXEC_CHECK; + alert = !(dentry->d_flags & DCACHE_VIRTUAL); + deny = ve_area_access_check & VE_AREA_EXEC_DENY; + } + + if (check && alert) + check_alert(mnt, dentry, "Exec"); + if (deny && alert) + return -EACCES; +#endif + return 0; +} +#endif + /* * Helper function for dentry_operations.d_dname() members */ @@ -2193,10 +2537,12 @@ resume: goto repeat; } atomic_dec(&dentry->d_count); + ub_dentry_uncharge_locked(dentry); } if 
(this_parent != root) { next = this_parent->d_u.d_child.next; atomic_dec(&this_parent->d_count); + ub_dentry_uncharge_locked(this_parent); this_parent = this_parent->d_parent; goto resume; } diff -urNp linux-2.6.32.48/fs/debugfs/inode.c linux-2.6.32.48-openvz/fs/debugfs/inode.c --- linux-2.6.32.48/fs/debugfs/inode.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/debugfs/inode.c 2011-11-21 17:40:45.000000000 -0500 @@ -228,9 +228,12 @@ struct dentry *debugfs_create_file(const { struct dentry *dentry = NULL; int error; + struct user_beancounter *ub; pr_debug("debugfs: creating file '%s'\n",name); + ub = set_exec_ub(get_ub0()); + error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) @@ -244,6 +247,7 @@ struct dentry *debugfs_create_file(const goto exit; } exit: + set_exec_ub(ub); return dentry; } EXPORT_SYMBOL_GPL(debugfs_create_file); diff -urNp linux-2.6.32.48/fs/devpts/inode.c linux-2.6.32.48-openvz/fs/devpts/inode.c --- linux-2.6.32.48/fs/devpts/inode.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/devpts/inode.c 2011-11-21 17:40:45.000000000 -0500 @@ -38,7 +38,9 @@ extern int pty_limit; /* Config limit on Unix98 ptys */ static DEFINE_MUTEX(allocated_ptys_lock); +#ifndef CONFIG_VE static struct vfsmount *devpts_mnt; +#endif struct pts_mount_opts { int setuid; @@ -83,7 +85,7 @@ static inline struct super_block *pts_sb if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) return inode->i_sb; #endif - return devpts_mnt->mnt_sb; + return get_exec_env()->devpts_mnt->mnt_sb; } #define PARSE_MOUNT 0 @@ -421,11 +423,12 @@ static void devpts_kill_sb(struct super_ kill_litter_super(sb); } -static struct file_system_type devpts_fs_type = { +struct file_system_type devpts_fs_type = { .name = "devpts", .get_sb = devpts_get_sb, .kill_sb = devpts_kill_sb, }; +EXPORT_SYMBOL(devpts_fs_type); /* * The normal naming convention is simply /dev/pts/; this conforms @@ -566,9 +569,9 @@ static int __init 
init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); if (!err) { - devpts_mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + get_ve0()->devpts_mnt = kern_mount(&devpts_fs_type); + if (IS_ERR(get_ve0()->devpts_mnt)) { + err = PTR_ERR(get_ve0()->devpts_mnt); unregister_filesystem(&devpts_fs_type); } } diff -urNp linux-2.6.32.48/fs/direct-io.c linux-2.6.32.48-openvz/fs/direct-io.c --- linux-2.6.32.48/fs/direct-io.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/direct-io.c 2011-11-21 17:40:45.000000000 -0500 @@ -658,7 +658,7 @@ submit_page_section(struct dio *dio, str /* * Read accounting is performed in submit_bio() */ - task_io_account_write(len); + task_io_account_write(page, len, 1); } /* diff -urNp linux-2.6.32.48/fs/eventpoll.c linux-2.6.32.48-openvz/fs/eventpoll.c --- linux-2.6.32.48/fs/eventpoll.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/eventpoll.c 2011-11-21 17:40:45.000000000 -0500 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -102,11 +103,6 @@ #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) -struct epoll_filefd { - struct file *file; - int fd; -}; - /* * Structure used to track possible nested calls, for too deep recursions * and loop cycles. @@ -126,82 +122,6 @@ struct nested_calls { spinlock_t lock; }; -/* - * Each file descriptor added to the eventpoll interface will - * have an entry of this type linked to the "rbr" RB tree. - */ -struct epitem { - /* RB tree node used to link this structure to the eventpoll RB tree */ - struct rb_node rbn; - - /* List header used to link this structure to the eventpoll ready list */ - struct list_head rdllink; - - /* - * Works together "struct eventpoll"->ovflist in keeping the - * single linked chain of items. 
- */ - struct epitem *next; - - /* The file descriptor information this item refers to */ - struct epoll_filefd ffd; - - /* Number of active wait queue attached to poll operations */ - int nwait; - - /* List containing poll wait queues */ - struct list_head pwqlist; - - /* The "container" of this item */ - struct eventpoll *ep; - - /* List header used to link this item to the "struct file" items list */ - struct list_head fllink; - - /* The structure that describe the interested events and the source fd */ - struct epoll_event event; -}; - -/* - * This structure is stored inside the "private_data" member of the file - * structure and rapresent the main data sructure for the eventpoll - * interface. - */ -struct eventpoll { - /* Protect the this structure access */ - spinlock_t lock; - - /* - * This mutex is used to ensure that files are not removed - * while epoll is using them. This is held during the event - * collection loop, the file cleanup path, the epoll file exit - * code and the ctl operations. - */ - struct mutex mtx; - - /* Wait queue used by sys_epoll_wait() */ - wait_queue_head_t wq; - - /* Wait queue used by file->poll() */ - wait_queue_head_t poll_wait; - - /* List of ready file descriptors */ - struct list_head rdllist; - - /* RB tree root used to store monitored fd structs */ - struct rb_root rbr; - - /* - * This is a single linked list that chains all the "struct epitem" that - * happened while transfering ready events to userspace w/out - * holding ->lock. - */ - struct epitem *ovflist; - - /* The user that created the eventpoll descriptor */ - struct user_struct *user; -}; - /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ @@ -241,7 +161,8 @@ static int max_user_watches __read_mostl /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). 
*/ -static DEFINE_MUTEX(epmutex); +DEFINE_MUTEX(epmutex); +EXPORT_SYMBOL_GPL(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; @@ -693,10 +614,11 @@ static unsigned int ep_eventpoll_poll(st } /* File callbacks that implement the eventpoll file behaviour */ -static const struct file_operations eventpoll_fops = { +const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, .poll = ep_eventpoll_poll }; +EXPORT_SYMBOL(eventpoll_fops); /* Fast test to see if the file is an evenpoll file */ static inline int is_file_epoll(struct file *f) @@ -778,7 +700,7 @@ free_uid: * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. */ -static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) +struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; @@ -801,6 +723,7 @@ static struct epitem *ep_find(struct eve return epir; } +EXPORT_SYMBOL_GPL(ep_find); /* * This is the callback that is passed to the wait queue wakeup @@ -916,7 +839,7 @@ static void ep_rbtree_insert(struct even /* * Must be called with "mtx" held. 
*/ -static int ep_insert(struct eventpoll *ep, struct epoll_event *event, +int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd) { int error, revents, pwake = 0; @@ -1015,6 +938,7 @@ error_unregister: return error; } +EXPORT_SYMBOL(ep_insert); /* * Modify the interest event mask by dropping an event if the new mask @@ -1297,6 +1221,7 @@ SYSCALL_DEFINE1(epoll_create, int, size) return sys_epoll_create1(0); } +EXPORT_SYMBOL(sys_epoll_create); /* * The following function implements the controller interface for diff -urNp linux-2.6.32.48/fs/exec.c linux-2.6.32.48-openvz/fs/exec.c --- linux-2.6.32.48/fs/exec.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/exec.c 2011-11-21 17:40:45.000000000 -0500 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,8 @@ #include #include "internal.h" +#include + int core_uses_pid; char core_pattern[CORENAME_MAX_SIZE] = "core"; unsigned int core_pipe_limit; @@ -69,6 +72,8 @@ int suid_dumpable = 0; /* The maximal length of core_pattern is also specified in sysctl.c */ +int sysctl_at_vsyscall; + static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); @@ -247,9 +252,14 @@ static int __bprm_mm_init(struct linux_b struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; + err = -ENOMEM; + if (ub_memory_charge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, + NULL, UB_SOFT)) + goto err_charge; + bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) - return -ENOMEM; + goto err_alloc; down_write(&mm->mmap_sem); vma->vm_mm = mm; @@ -281,6 +291,9 @@ err: up_write(&mm->mmap_sem); bprm->vma = NULL; kmem_cache_free(vm_area_cachep, vma); +err_alloc: + ub_memory_uncharge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, NULL); +err_charge: return err; } @@ -751,10 +764,11 @@ int kernel_read(struct file *file, loff_ EXPORT_SYMBOL(kernel_read); -static int exec_mmap(struct mm_struct *mm) +static int exec_mmap(struct 
linux_binprm *bprm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm, *mm; + int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -774,6 +788,10 @@ static int exec_mmap(struct mm_struct *m return -EINTR; } } + + ret = 0; + mm = bprm->mm; + mm->vps_dumpable = 1; task_lock(tsk); active_mm = tsk->active_mm; tsk->mm = mm; @@ -781,15 +799,25 @@ static int exec_mmap(struct mm_struct *m activate_mm(active_mm, mm); task_unlock(tsk); arch_pick_mmap_layout(mm); + bprm->mm = NULL; /* We're using it now */ + +#ifdef CONFIG_VZ_GENCALLS + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXECMMAP, + bprm) & NOTIFY_FAIL) { + /* similar to binfmt_elf */ + send_sig(SIGKILL, current, 0); + ret = -ENOMEM; + } +#endif if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); mm_update_next_owner(old_mm); mmput(old_mm); - return 0; + return ret; } mmdrop(active_mm); - return 0; + return ret; } /* @@ -884,6 +912,10 @@ static int de_thread(struct task_struct transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); +#ifdef CONFIG_VE + list_replace_rcu(&leader->ve_task_info.vetask_list, + &tsk->ve_task_info.vetask_list); +#endif tsk->group_leader = tsk; leader->group_leader = tsk; @@ -1353,6 +1385,10 @@ int do_execve(char * filename, bool clear_in_exec; int retval; + retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); + if (retval) + return retval; + retval = unshare_files(&displaced); if (retval) goto out_ret; @@ -1606,7 +1642,7 @@ static int zap_process(struct task_struc signal_wake_up(t, 1); nr++; } - } while_each_thread(start, t); + } while_each_thread_ve(start, t); return nr; } @@ -1661,7 +1697,7 @@ static inline int zap_threads(struct tas * next_thread(). 
*/ rcu_read_lock(); - for_each_process(g) { + for_each_process_ve(g) { if (g == tsk->group_leader) continue; if (g->flags & PF_KTHREAD) @@ -1676,7 +1712,7 @@ static inline int zap_threads(struct tas } break; } - } while_each_thread(g, p); + } while_each_thread_ve(g, p); } rcu_read_unlock(); done: @@ -1844,7 +1880,7 @@ void do_coredump(long signr, int exit_co /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_state || !get_dumpable(mm)) { + if (mm->core_state || !get_dumpable(mm) || mm->vps_dumpable != 1) { up_write(&mm->mmap_sem); put_cred(cred); goto fail; diff -urNp linux-2.6.32.48/fs/ext2/namei.c linux-2.6.32.48-openvz/fs/ext2/namei.c --- linux-2.6.32.48/fs/ext2/namei.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/ext2/namei.c 2011-11-21 17:40:45.000000000 -0500 @@ -31,6 +31,7 @@ */ #include +#include #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -262,6 +263,8 @@ static int ext2_unlink(struct inode * di struct page * page; int err = -ENOENT; + vfs_dq_init(inode); + de = ext2_find_entry (dir, &dentry->d_name, &page); if (!de) goto out; @@ -304,6 +307,9 @@ static int ext2_rename (struct inode * o struct ext2_dir_entry_2 * old_de; int err = -ENOENT; + if (new_inode) + vfs_dq_init(new_inode); + old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff -urNp linux-2.6.32.48/fs/ext2/super.c linux-2.6.32.48-openvz/fs/ext2/super.c --- linux-2.6.32.48/fs/ext2/super.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/ext2/super.c 2011-11-21 17:40:45.000000000 -0500 @@ -1426,7 +1426,7 @@ static struct file_system_type ext2_fs_t .name = "ext2", .get_sb = ext2_get_sb, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, }; static int __init init_ext2_fs(void) diff -urNp linux-2.6.32.48/fs/ext3/ioctl.c linux-2.6.32.48-openvz/fs/ext3/ioctl.c --- linux-2.6.32.48/fs/ext3/ioctl.c 2011-11-08 
19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/ext3/ioctl.c 2011-11-21 17:40:45.000000000 -0500 @@ -78,7 +78,7 @@ long ext3_ioctl(struct file *filp, unsig * the relevant capability. */ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) + if (!capable(CAP_SYS_ADMIN)) goto flags_out; } diff -urNp linux-2.6.32.48/fs/ext3/namei.c linux-2.6.32.48-openvz/fs/ext3/namei.c --- linux-2.6.32.48/fs/ext3/namei.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/ext3/namei.c 2011-11-21 17:40:45.000000000 -0500 @@ -1340,7 +1340,7 @@ static int add_dirent_to_buf(handle_t *h if (err) ext3_std_error(dir->i_sb, err); brelse(bh); - return 0; + return err; } /* diff -urNp linux-2.6.32.48/fs/ext3/super.c linux-2.6.32.48-openvz/fs/ext3/super.c --- linux-2.6.32.48/fs/ext3/super.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/ext3/super.c 2011-11-21 17:40:45.000000000 -0500 @@ -2993,7 +2993,7 @@ static struct file_system_type ext3_fs_t .name = "ext3", .get_sb = ext3_get_sb, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, }; static int __init init_ext3_fs(void) diff -urNp linux-2.6.32.48/fs/ext4/inode.c linux-2.6.32.48-openvz/fs/ext4/inode.c --- linux-2.6.32.48/fs/ext4/inode.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/ext4/inode.c 2011-11-21 17:40:45.000000000 -0500 @@ -5845,9 +5845,14 @@ int ext4_page_mkwrite(struct vm_area_str int ret = -EINVAL; void *fsdata; struct file *file = vma->vm_file; - struct inode *inode = file->f_path.dentry->d_inode; - struct address_space *mapping = inode->i_mapping; + struct inode *inode; + struct address_space *mapping; + if (file->f_op->get_host) + file = file->f_op->get_host(file); + + inode = file->f_path.dentry->d_inode; + mapping = inode->i_mapping; /* * Get i_alloc_sem to stop truncates messing with the inode. We cannot * get i_mutex because we are already holding mmap_sem. 
diff -urNp linux-2.6.32.48/fs/ext4/ioctl.c linux-2.6.32.48-openvz/fs/ext4/ioctl.c --- linux-2.6.32.48/fs/ext4/ioctl.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/ext4/ioctl.c 2011-11-21 17:40:45.000000000 -0500 @@ -77,7 +77,7 @@ long ext4_ioctl(struct file *filp, unsig * the relevant capability. */ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) + if (!capable(CAP_SYS_ADMIN)) goto flags_out; } if (oldflags & EXT4_EXTENTS_FL) { diff -urNp linux-2.6.32.48/fs/ext4/super.c linux-2.6.32.48-openvz/fs/ext4/super.c --- linux-2.6.32.48/fs/ext4/super.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/ext4/super.c 2011-11-21 17:40:45.000000000 -0500 @@ -2462,10 +2462,10 @@ static int ext4_fill_super(struct super_ set_opt(sbi->s_mount_opt, BARRIER); /* - * enable delayed allocation by default - * Use -o nodelalloc to turn it off + * Don't enable delayed allocation by default + * Use -o delalloc to turn it on */ - set_opt(sbi->s_mount_opt, DELALLOC); + /* set_opt(sbi->s_mount_opt, DELALLOC); */ if (!parse_options((char *) data, sb, &journal_devnum, &journal_ioprio, NULL, 0)) diff -urNp linux-2.6.32.48/fs/fcntl.c linux-2.6.32.48-openvz/fs/fcntl.c --- linux-2.6.32.48/fs/fcntl.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/fcntl.c 2011-11-21 17:40:45.000000000 -0500 @@ -126,6 +126,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldf } return sys_dup3(oldfd, newfd, 0); } +EXPORT_SYMBOL_GPL(sys_dup2); SYSCALL_DEFINE1(dup, unsigned int, fildes) { @@ -149,6 +150,9 @@ static int setfl(int fd, struct file * f struct inode * inode = filp->f_path.dentry->d_inode; int error = 0; + if (!capable(CAP_SYS_RAWIO) && !odirect_enable) + arg &= ~O_DIRECT; + /* * O_APPEND cannot be cleared if the file is marked as append-only * and the file is open for write. 
@@ -742,7 +746,7 @@ EXPORT_SYMBOL(kill_fasync); static int __init fasync_init(void) { fasync_cache = kmem_cache_create("fasync_cache", - sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + sizeof(struct fasync_struct), 0, SLAB_PANIC|SLAB_UBC, NULL); return 0; } diff -urNp linux-2.6.32.48/fs/file.c linux-2.6.32.48-openvz/fs/file.c --- linux-2.6.32.48/fs/file.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/file.c 2011-11-21 17:40:45.000000000 -0500 @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,8 @@ #include #include +#include + struct fdtable_defer { spinlock_t lock; struct work_struct wq; @@ -42,9 +45,9 @@ static DEFINE_PER_CPU(struct fdtable_def static inline void * alloc_fdmem(unsigned int size) { if (size <= PAGE_SIZE) - return kmalloc(size, GFP_KERNEL); + return kmalloc(size, GFP_KERNEL_UBC); else - return vmalloc(size); + return ub_vmalloc(size); } static inline void free_fdarr(struct fdtable *fdt) @@ -163,7 +166,7 @@ static struct fdtable * alloc_fdtable(un if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_UBC); if (!fdt) goto out; fdt->max_fds = nr; @@ -198,7 +201,7 @@ out: * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_fdtable(struct files_struct *files, int nr) +int expand_fdtable(struct files_struct *files, int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -238,6 +241,7 @@ static int expand_fdtable(struct files_s } return 1; } +EXPORT_SYMBOL_GPL(expand_fdtable); /* * Expand files. 
diff -urNp linux-2.6.32.48/fs/filesystems.c linux-2.6.32.48-openvz/fs/filesystems.c --- linux-2.6.32.48/fs/filesystems.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/filesystems.c 2011-11-21 17:40:45.000000000 -0500 @@ -14,6 +14,9 @@ #include #include #include +#include /* for 'current' */ +#include +#include #include /* @@ -23,8 +26,8 @@ * During the unload module must call unregister_filesystem(). * We can access the fields of list element if: * 1) spinlock is held or - * 2) we hold the reference to the module. - * The latter can be guaranteed by call of try_module_get(); if it + * 2) we hold the reference to the element. + * The latter can be guaranteed by call of try_get_filesystem(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. */ @@ -32,24 +35,46 @@ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); +int try_get_filesystem(struct file_system_type *fs) +{ + if (try_module_get(fs->owner)) { + (void)get_ve(fs->owner_env); /* module pinned; take the VE reference too */ + return 1; + } + return 0; /* no reference of either kind was taken */ +} + /* WARNING: This can be used only if we _already_ own a reference */ void get_filesystem(struct file_system_type *fs) { + (void)get_ve(fs->owner_env); __module_get(fs->owner); } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); + put_ve(fs->owner_env); /* pairs with get_ve() in the getters above */ } -static struct file_system_type **find_filesystem(const char *name, unsigned len) +static inline int check_ve_fstype(struct file_system_type *p, + struct ve_struct *env) +{ /* nonzero if p is FS_VIRTUALIZED or ve_accessible_strict() accepts env */ + return ((p->fs_flags & FS_VIRTUALIZED) || + ve_accessible_strict(p->owner_env, env)); +} + +static struct file_system_type **find_filesystem(const char *name, unsigned len, + struct ve_struct *env) { struct file_system_type **p; - for (p=&file_systems; *p; p=&(*p)->next) + for (p=&file_systems; *p; p=&(*p)->next) { + if (!check_ve_fstype(*p, env)) + continue; /* entry rejected by check_ve_fstype() for env */ if (strlen((*p)->name) == len && strncmp((*p)->name, name, len) ==
0) break; + } return p; } @@ -75,8 +100,12 @@ int register_filesystem(struct file_syst if (fs->next) return -EBUSY; INIT_LIST_HEAD(&fs->fs_supers); + if (fs->owner_env == NULL) + fs->owner_env = get_ve0(); + if (fs->proto == NULL) + fs->proto = fs; write_lock(&file_systems_lock); - p = find_filesystem(fs->name, strlen(fs->name)); + p = find_filesystem(fs->name, strlen(fs->name), fs->owner_env); if (*p) res = -EBUSY; else @@ -120,6 +149,75 @@ int unregister_filesystem(struct file_sy EXPORT_SYMBOL(unregister_filesystem); +#ifdef CONFIG_VE +int register_ve_fs_type(struct ve_struct *ve, struct file_system_type *template, + struct file_system_type **p_fs_type, struct vfsmount **p_mnt) +{ + struct vfsmount *mnt; + struct file_system_type *local_fs_type; + int ret; + + local_fs_type = kzalloc(sizeof(*local_fs_type) + sizeof(void *), + GFP_KERNEL); + if (local_fs_type == NULL) + return -ENOMEM; + + local_fs_type->name = template->name; + local_fs_type->fs_flags = template->fs_flags; + local_fs_type->get_sb = template->get_sb; + local_fs_type->kill_sb = template->kill_sb; + local_fs_type->owner = template->owner; + local_fs_type->owner_env = ve; + local_fs_type->proto = template; + + get_filesystem(local_fs_type); /* get_ve() inside */ + + ret = register_filesystem(local_fs_type); + if (ret) + goto reg_err; + + if (p_mnt == NULL) + goto done; + + mnt = vfs_kern_mount(local_fs_type, 0, local_fs_type->name, NULL); + if (IS_ERR(mnt)) + goto mnt_err; + + *p_mnt = mnt; +done: + *p_fs_type = local_fs_type; + return 0; + +mnt_err: + ret = PTR_ERR(mnt); + unregister_filesystem(local_fs_type); /* does not put */ + +reg_err: + put_filesystem(local_fs_type); + kfree(local_fs_type); + printk(KERN_DEBUG + "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret); + return ret; +} + +EXPORT_SYMBOL(register_ve_fs_type); + +void unregister_ve_fs_type(struct file_system_type *local_fs_type, + struct vfsmount *local_fs_mount) +{ + if (local_fs_mount == NULL && local_fs_type == NULL) + 
return; + + unregister_filesystem(local_fs_type); + umount_ve_fs_type(local_fs_type, -1); + if (local_fs_mount) + kern_umount(local_fs_mount); /* alias to mntput, drop our ref */ + put_filesystem(local_fs_type); +} + +EXPORT_SYMBOL(unregister_ve_fs_type); +#endif + static int fs_index(const char __user * __name) { struct file_system_type * tmp; @@ -133,11 +231,14 @@ static int fs_index(const char __user * err = -EINVAL; read_lock(&file_systems_lock); - for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { + for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) { + if (!check_ve_fstype(tmp, get_exec_env())) + continue; if (strcmp(tmp->name,name) == 0) { err = index; break; } + index++; } read_unlock(&file_systems_lock); putname(name); @@ -150,9 +251,15 @@ static int fs_name(unsigned int index, c int len, res; read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) - if (index <= 0 && try_module_get(tmp->owner)) - break; + for (tmp = file_systems; tmp; tmp = tmp->next) { + if (!check_ve_fstype(tmp, get_exec_env())) + continue; + if (!index) { + if (try_get_filesystem(tmp)) + break; + } else + index--; + } read_unlock(&file_systems_lock); if (!tmp) return -EINVAL; @@ -170,8 +277,9 @@ static int fs_maxindex(void) int index; read_lock(&file_systems_lock); - for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) - ; + for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next) + if (check_ve_fstype(tmp, get_exec_env())) + index++; read_unlock(&file_systems_lock); return index; } @@ -207,9 +315,10 @@ int __init get_filesystem_list(char *buf read_lock(&file_systems_lock); tmp = file_systems; while (tmp && len < PAGE_SIZE - 80) { - len += sprintf(buf+len, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", - tmp->name); + if (check_ve_fstype(tmp, get_exec_env())) + len += sprintf(buf+len, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", + tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); @@ -224,9 +333,12 @@ static int filesystems_proc_show(struct read_lock(&file_systems_lock); tmp = file_systems; while (tmp) { + if (!check_ve_fstype(tmp, get_exec_env())) + goto next; /* skip in VE */ seq_printf(m, "%s\t%s\n", (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", tmp->name); +next: tmp = tmp->next; } read_unlock(&file_systems_lock); @@ -247,7 +359,7 @@ static const struct file_operations file static int __init proc_filesystems_init(void) { - proc_create("filesystems", 0, NULL, &filesystems_proc_fops); + proc_create("filesystems", 0, &glob_proc_root, &filesystems_proc_fops); return 0; } module_init(proc_filesystems_init); @@ -258,8 +370,8 @@ static struct file_system_type *__get_fs struct file_system_type *fs; read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); - if (fs && !try_module_get(fs->owner)) + fs = *(find_filesystem(name, len, get_exec_env())); + if (fs && !try_get_filesystem(fs)) fs = NULL; read_unlock(&file_systems_lock); return fs; diff -urNp linux-2.6.32.48/fs/file_table.c linux-2.6.32.48-openvz/fs/file_table.c --- linux-2.6.32.48/fs/file_table.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/file_table.c 2011-11-21 17:40:45.000000000 -0500 @@ -22,9 +22,14 @@ #include #include #include +#include #include +#include +#include +#include + /* sysctl tunables... 
*/ struct files_stat_struct files_stat = { .max_files = NR_FILE @@ -34,7 +39,8 @@ struct files_stat_struct files_stat = { __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); /* SLAB cache for file structures */ -static struct kmem_cache *filp_cachep __read_mostly; +struct kmem_cache *filp_cachep __read_mostly; +EXPORT_SYMBOL_GPL(filp_cachep); static struct percpu_counter nr_files __cacheline_aligned_in_smp; @@ -43,13 +49,16 @@ static inline void file_free_rcu(struct struct file *f = container_of(head, struct file, f_u.fu_rcuhead); put_cred(f->f_cred); + put_ve(f->owner_env); kmem_cache_free(filp_cachep, f); } static inline void file_free(struct file *f) { - percpu_counter_dec(&nr_files); file_check_state(f); + if (f->f_ub == get_ub0()) + percpu_counter_dec(&nr_files); + ub_file_uncharge(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -103,11 +112,14 @@ struct file *get_empty_filp(void) const struct cred *cred = current_cred(); static int old_max; struct file * f; + int acct; + acct = (get_exec_ub() == get_ub0()); /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (acct && get_nr_files() >= files_stat.max_files && + !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. 
@@ -120,7 +132,13 @@ struct file *get_empty_filp(void) if (f == NULL) goto fail; - percpu_counter_inc(&nr_files); + if (ub_file_charge(f)) + goto fail_ch; + if (acct) + percpu_counter_inc(&nr_files); + + f->owner_env = get_ve(get_exec_env()); + f->f_cred = get_cred(cred); if (security_file_alloc(f)) goto fail_sec; @@ -146,6 +164,10 @@ fail_sec: file_free(f); fail: return NULL; + +fail_ch: + kmem_cache_free(filp_cachep, f); + return NULL; } EXPORT_SYMBOL(get_empty_filp); diff -urNp linux-2.6.32.48/fs/fs_struct.c linux-2.6.32.48-openvz/fs/fs_struct.c --- linux-2.6.32.48/fs/fs_struct.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/fs_struct.c 2011-11-21 17:40:45.000000000 -0500 @@ -47,7 +47,7 @@ void chroot_fs_refs(struct path *old_roo int count = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_ve(g, p) { task_lock(p); fs = p->fs; if (fs) { @@ -67,7 +67,7 @@ void chroot_fs_refs(struct path *old_roo write_unlock(&fs->lock); } task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_ve(g, p); read_unlock(&tasklist_lock); while (count--) path_put(old_root); @@ -96,6 +96,7 @@ void exit_fs(struct task_struct *tsk) free_fs_struct(fs); } } +EXPORT_SYMBOL(exit_fs); struct fs_struct *copy_fs_struct(struct fs_struct *old) { diff -urNp linux-2.6.32.48/fs/fuse/control.c linux-2.6.32.48-openvz/fs/fuse/control.c --- linux-2.6.32.48/fs/fuse/control.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/fuse/control.c 2011-11-21 17:40:45.000000000 -0500 @@ -10,6 +10,8 @@ #include #include +#include +#include #define FUSE_CTL_SUPER_MAGIC 0x65735543 @@ -17,7 +19,11 @@ * This is non-NULL when the single instance of the control filesystem * exists. 
Protected by fuse_mutex */ +#ifdef CONFIG_VE +#define fuse_control_sb (get_exec_env()->_fuse_control_sb) +#else static struct super_block *fuse_control_sb; +#endif static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) { @@ -345,12 +351,55 @@ static struct file_system_type fuse_ctl_ .kill_sb = fuse_ctl_kill_sb, }; +#ifdef CONFIG_VE +/* ve_hook .init callback: register a per-VE copy of fuse_ctl_fs_type. */ +static int fuse_ctl_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_ctl_fs_type != NULL) + return -EBUSY; + + return register_ve_fs_type(ve, &fuse_ctl_fs_type, + &ve->fuse_ctl_fs_type, NULL); +} + +/* ve_hook .fini callback: unregister and free the copy made by .init. */ +static void fuse_ctl_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_ctl_fs_type == NULL) + return; + + unregister_ve_fs_type(ve->fuse_ctl_fs_type, NULL); + /* kzalloc-ed by register_ve_fs_type(); unregister does not free it */ + kfree(ve->fuse_ctl_fs_type); + ve->fuse_ctl_fs_type = NULL; +} + +static struct ve_hook fuse_ctl_ve_hook = { + .init = fuse_ctl_start, + .fini = fuse_ctl_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif + int __init fuse_ctl_init(void) { - return register_filesystem(&fuse_ctl_fs_type); + int err; + + err = register_filesystem(&fuse_ctl_fs_type); + if (err == 0) + ve_hook_register(VE_SS_CHAIN, &fuse_ctl_ve_hook); + return err; } void fuse_ctl_cleanup(void) { + ve_hook_unregister(&fuse_ctl_ve_hook); unregister_filesystem(&fuse_ctl_fs_type); } diff -urNp linux-2.6.32.48/fs/fuse/fuse_i.h linux-2.6.32.48-openvz/fs/fuse/fuse_i.h --- linux-2.6.32.48/fs/fuse/fuse_i.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/fuse/fuse_i.h 2011-11-21 17:40:45.000000000 -0500 @@ -45,7 +45,11 @@ #define FUSE_ALLOW_OTHER (1 << 1) /** List of active connections */ +#ifdef CONFIG_VE +#define fuse_conn_list (get_exec_env()->_fuse_conn_list) +#else extern struct list_head fuse_conn_list; +#endif /** Global mutex protecting fuse_conn_list and the control filesystem */ extern struct mutex fuse_mutex; diff -urNp linux-2.6.32.48/fs/fuse/inode.c linux-2.6.32.48-openvz/fs/fuse/inode.c ---
linux-2.6.32.48/fs/fuse/inode.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/fuse/inode.c 2011-11-21 17:40:45.000000000 -0500 @@ -20,13 +20,16 @@ #include #include #include +#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; +#ifndef CONFIG_VE struct list_head fuse_conn_list; +#endif DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, struct kernel_param *kp); @@ -1194,6 +1197,41 @@ static void fuse_sysfs_cleanup(void) kobject_put(fuse_kobj); } +#ifdef CONFIG_VE +static int fuse_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_fs_type != NULL) + return -EBUSY; + + INIT_LIST_HEAD(&ve->_fuse_conn_list); + return register_ve_fs_type(ve, &fuse_fs_type, &ve->fuse_fs_type, NULL); +} + +static void fuse_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_fs_type == NULL) + return; + + unregister_ve_fs_type(ve->fuse_fs_type, NULL); + kfree(ve->fuse_fs_type); + ve->fuse_fs_type = NULL; + BUG_ON(!list_empty(&ve->_fuse_conn_list)); +} + +static struct ve_hook fuse_ve_hook = { + .init = fuse_start, + .fini = fuse_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif + static int __init fuse_init(void) { int res; @@ -1218,6 +1256,7 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; + ve_hook_register(VE_SS_CHAIN, &fuse_ve_hook); sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); @@ -1237,6 +1276,7 @@ static void __exit fuse_exit(void) { printk(KERN_DEBUG "fuse exit\n"); + ve_hook_unregister(&fuse_ve_hook); fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); diff -urNp linux-2.6.32.48/fs/inode.c linux-2.6.32.48-openvz/fs/inode.c --- linux-2.6.32.48/fs/inode.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/inode.c 2011-11-21 17:40:45.000000000 -0500 @@ 
-8,10 +8,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include #include @@ -27,6 +30,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -106,7 +110,8 @@ static DECLARE_RWSEM(iprune_sem); */ struct inodes_stat_t inodes_stat; -static struct kmem_cache *inode_cachep __read_mostly; +struct kmem_cache *inode_cachep __read_mostly; + static void wake_up_inode(struct inode *inode) { @@ -125,19 +130,22 @@ static void wake_up_inode(struct inode * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. */ + +static const struct address_space_operations vfs_empty_aops; /* const, as the removed empty_aops was */ +const struct inode_operations vfs_empty_iops; +static const struct file_operations vfs_empty_fops; +EXPORT_SYMBOL(vfs_empty_iops); + int inode_init_always(struct super_block *sb, struct inode *inode) { - static const struct address_space_operations empty_aops; - static const struct inode_operations empty_iops; - static const struct file_operations empty_fops; struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; atomic_set(&inode->i_count, 1); - inode->i_op = &empty_iops; - inode->i_fop = &empty_fops; + inode->i_op = &vfs_empty_iops; + inode->i_fop = &vfs_empty_fops; inode->i_nlink = 1; inode->i_uid = 0; inode->i_gid = 0; @@ -163,15 +171,15 @@ int inode_init_always(struct super_block goto out_free_security; spin_lock_init(&inode->i_lock); - lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); + lockdep_set_class(&inode->i_lock, &sb->s_type->proto->i_lock_key); mutex_init(&inode->i_mutex); - lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); + lockdep_set_class(&inode->i_mutex, &sb->s_type->proto->i_mutex_key); init_rwsem(&inode->i_alloc_sem); - lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); +
lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->proto->i_alloc_sem_key); - mapping->a_ops = &empty_aops; + mapping->a_ops = &vfs_empty_aops; mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); @@ -378,13 +386,76 @@ static void dispose_list(struct list_hea spin_unlock(&inode_lock); } +static void show_header(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + printk("VFS: Busy inodes after unmount. " + "sb = %p, fs type = %s, sb count = %d, " + "sb->s_root = %s\n", sb, + (sb->s_type != NULL) ? sb->s_type->name : "", + sb->s_count, + (sb->s_root != NULL) ? + (char *)sb->s_root->d_name.name : ""); +} + +static void show_inode(struct inode *inode) +{ + struct dentry *d; + struct vfsmount *mnt; + int i; + + printk("inode = %p, inode->i_count = %d, " + "inode->i_nlink = %d, " + "inode->i_mode = %d, " + "inode->i_state = %ld, " + "inode->i_flags = %d, " + "inode->i_devices.next = %p, " + "inode->i_devices.prev = %p, " + "inode->i_ino = %ld\n", + inode, + atomic_read(&inode->i_count), + inode->i_nlink, + inode->i_mode, + inode->i_state, + inode->i_flags, + inode->i_devices.next, + inode->i_devices.prev, + inode->i_ino); + printk("inode dump: "); + for (i = 0; i < sizeof(*inode); i++) + printk("%2.2x ", *((u_char *)inode + i)); + printk("\n"); + list_for_each_entry(d, &inode->i_dentry, d_alias) { + printk(" d_alias %s d_count=%d d_flags=%x\n", + d->d_name.name, atomic_read(&d->d_count), d->d_flags); + for (i = 0; i < sizeof(*d); i++) + printk("%2.2x ", *((u_char *)d + i)); + printk("\n"); + } + + spin_lock(&vfsmount_lock); + list_for_each_entry(mnt, &get_task_mnt_ns(current)->list, mnt_list) { + if (mnt->mnt_sb != inode->i_sb) + continue; + printk("mnt=%p count=%d flags=%x exp_mask=%x\n", + mnt, atomic_read(&mnt->mnt_count), + mnt->mnt_flags, + mnt->mnt_expiry_mark); + for (i = 0; i < sizeof(*mnt); i++) + printk("%2.2x ", *((u_char *)mnt + i)); + printk("\n"); + } + spin_unlock(&vfsmount_lock); +} + /* * 
Invalidate all inodes for a device. */ -static int invalidate_list(struct list_head *head, struct list_head *dispose) +static int invalidate_list(struct list_head *head, struct list_head *dispose, int check) { struct list_head *next; - int busy = 0, count = 0; + int busy = 0, count = 0, once = 1; next = head->next; for (;;) { @@ -414,6 +485,14 @@ static int invalidate_list(struct list_h continue; } busy = 1; + + if (check) { + if (once) { + once = 0; + show_header(inode); + } + show_inode(inode); + } } /* only unused inodes may be cached with i_count zero */ inodes_stat.nr_unused -= count; @@ -428,7 +507,7 @@ static int invalidate_list(struct list_h * fails because there are busy inodes then a non zero value is returned. * If the discard is successful all the inodes have been discarded. */ -int invalidate_inodes(struct super_block *sb) +int invalidate_inodes_check(struct super_block *sb, int check) { int busy; LIST_HEAD(throw_away); @@ -437,7 +516,7 @@ int invalidate_inodes(struct super_block spin_lock(&inode_lock); inotify_unmount_inodes(&sb->s_inodes); fsnotify_unmount_inodes(&sb->s_inodes); - busy = invalidate_list(&sb->s_inodes, &throw_away); + busy = invalidate_list(&sb->s_inodes, &throw_away, check); spin_unlock(&inode_lock); dispose_list(&throw_away); @@ -445,7 +524,7 @@ int invalidate_inodes(struct super_block return busy; } -EXPORT_SYMBOL(invalidate_inodes); +EXPORT_SYMBOL(invalidate_inodes_check); static int can_unuse(struct inode *inode) { @@ -536,6 +615,7 @@ static void prune_icache(int nr_to_scan) */ static int shrink_icache_memory(int nr, gfp_t gfp_mask) { + KSTAT_PERF_ENTER(shrink_icache) if (nr) { /* * Nasty deadlock avoidance. 
We may hold various FS locks, @@ -546,6 +626,7 @@ static int shrink_icache_memory(int nr, return -1; prune_icache(nr); } + KSTAT_PERF_LEAVE(shrink_icache) return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } @@ -703,7 +784,7 @@ void unlock_new_inode(struct inode *inod mutex_destroy(&inode->i_mutex); mutex_init(&inode->i_mutex); lockdep_set_class(&inode->i_mutex, - &type->i_mutex_dir_key); + &type->proto->i_mutex_dir_key); } } #endif @@ -1266,7 +1347,7 @@ int generic_detach_inode(struct inode *i if (!(inode->i_state & (I_DIRTY|I_SYNC))) list_move(&inode->i_list, &inode_unused); inodes_stat.nr_unused++; - if (sb->s_flags & MS_ACTIVE) { + if (sb->s_flags & MS_ACTIVE && !(inode->i_flags & S_NOUNUSE)) { spin_unlock(&inode_lock); return 0; } diff -urNp linux-2.6.32.48/fs/ioprio.c linux-2.6.32.48-openvz/fs/ioprio.c --- linux-2.6.32.48/fs/ioprio.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/ioprio.c 2011-11-21 17:40:45.000000000 -0500 @@ -26,6 +26,8 @@ #include #include #include +#include +#include int set_task_ioprio(struct task_struct *task, int ioprio) { @@ -78,8 +80,11 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int data = IOPRIO_PRIO_DATA(ioprio); struct task_struct *p, *g; struct user_struct *user; - struct pid *pgrp; int ret; + struct pid *pgrp; + + if (!ve_is_super(get_exec_env())) + return -EPERM; switch (class) { case IOPRIO_CLASS_RT: @@ -137,17 +142,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, if (!user) break; - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (__task_cred(p)->uid != who) continue; ret = set_task_ioprio(p, ioprio); if (ret) goto free_uid; - } while_each_thread(g, p); + } while_each_thread_all(g, p); free_uid: if (who) free_uid(user); break; + case IOPRIO_WHO_UBC: + if (class != IOPRIO_CLASS_BE) { + ret = -ERANGE; + break; + } + + ret = ve_set_ioprio(who, data); + break; default: ret = -EINVAL; } @@ -192,9 +205,9 @@ SYSCALL_DEFINE2(ioprio_get, int, which, { struct task_struct *g, *p; struct user_struct 
*user; - struct pid *pgrp; int ret = -ESRCH; int tmpio; + struct pid *pgrp; read_lock(&tasklist_lock); switch (which) { @@ -230,7 +243,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, if (!user) break; - do_each_thread(g, p) { + do_each_thread_ve(g, p) { if (__task_cred(p)->uid != user->uid) continue; tmpio = get_task_ioprio(p); @@ -240,7 +253,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_thread(g, p); + } while_each_thread_ve(g, p); if (who) free_uid(user); diff -urNp linux-2.6.32.48/fs/Kconfig linux-2.6.32.48-openvz/fs/Kconfig --- linux-2.6.32.48/fs/Kconfig 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/Kconfig 2011-11-21 17:40:45.000000000 -0500 @@ -63,6 +63,14 @@ source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" +config SIM_FS + tristate "VPS filesystem" + default m + help + This file system is a part of Virtuozzo. It introduces a fake + superblock and blockdev to VE to hide real device and show + statfs results taken from quota.
+ config CUSE tristate "Character device in Userpace support" depends on FUSE_FS diff -urNp linux-2.6.32.48/fs/lockd/clntlock.c linux-2.6.32.48-openvz/fs/lockd/clntlock.c --- linux-2.6.32.48/fs/lockd/clntlock.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/lockd/clntlock.c 2011-11-21 17:40:45.000000000 -0500 @@ -78,8 +78,12 @@ EXPORT_SYMBOL_GPL(nlmclnt_init); */ void nlmclnt_done(struct nlm_host *host) { + struct ve_struct *old_ve; + nlm_release_host(host); + old_ve = set_exec_env(host->owner_env); lockd_down(); + (void)set_exec_env(old_ve); } EXPORT_SYMBOL_GPL(nlmclnt_done); diff -urNp linux-2.6.32.48/fs/lockd/clntproc.c linux-2.6.32.48-openvz/fs/lockd/clntproc.c --- linux-2.6.32.48/fs/lockd/clntproc.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/lockd/clntproc.c 2011-11-21 17:40:45.000000000 -0500 @@ -155,12 +155,15 @@ int nlmclnt_proc(struct nlm_host *host, { struct nlm_rqst *call; int status; + struct ve_struct *ve; nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; + ve = set_exec_env(host->owner_env); + nlmclnt_locks_init_private(fl, host); /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); @@ -182,6 +185,7 @@ int nlmclnt_proc(struct nlm_host *host, unlock_kernel(); dprintk("lockd: clnt proc returns %d\n", status); + (void)set_exec_env(ve); return status; } EXPORT_SYMBOL_GPL(nlmclnt_proc); diff -urNp linux-2.6.32.48/fs/lockd/grace.c linux-2.6.32.48-openvz/fs/lockd/grace.c --- linux-2.6.32.48/fs/lockd/grace.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/lockd/grace.c 2011-11-21 17:40:45.000000000 -0500 @@ -4,9 +4,13 @@ #include #include +#include +#include +#ifndef CONFIG_VE static LIST_HEAD(grace_list); static DEFINE_SPINLOCK(grace_lock); +#endif /** * locks_start_grace @@ -21,9 +25,13 @@ static DEFINE_SPINLOCK(grace_lock); */ void locks_start_grace(struct lock_manager *lm) { +#ifdef CONFIG_VE + atomic_inc(&get_exec_env()->locks_in_grace); +#else 
spin_lock(&grace_lock); list_add(&lm->list, &grace_list); spin_unlock(&grace_lock); +#endif } EXPORT_SYMBOL_GPL(locks_start_grace); @@ -39,9 +47,13 @@ EXPORT_SYMBOL_GPL(locks_start_grace); */ void locks_end_grace(struct lock_manager *lm) { +#ifdef CONFIG_VE + atomic_dec(&get_exec_env()->locks_in_grace); +#else spin_lock(&grace_lock); list_del_init(&lm->list); spin_unlock(&grace_lock); +#endif } EXPORT_SYMBOL_GPL(locks_end_grace); @@ -54,6 +66,10 @@ EXPORT_SYMBOL_GPL(locks_end_grace); */ int locks_in_grace(void) { +#ifdef CONFIG_VE + return atomic_read(&get_exec_env()->locks_in_grace) != 0; +#else return !list_empty(&grace_list); +#endif } EXPORT_SYMBOL_GPL(locks_in_grace); diff -urNp linux-2.6.32.48/fs/lockd/host.c linux-2.6.32.48-openvz/fs/lockd/host.c --- linux-2.6.32.48/fs/lockd/host.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/lockd/host.c 2011-11-21 17:40:45.000000000 -0500 @@ -30,7 +30,7 @@ static unsigned long next_gc; static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); -static void nlm_gc_hosts(void); +static int nlm_gc_hosts(struct ve_struct *ve); struct nlm_lookup_host_info { const int server; /* search for server|client */ @@ -96,11 +96,13 @@ static struct nlm_host *nlm_lookup_host( struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; + struct ve_struct *ve; + ve = get_exec_env(); mutex_lock(&nlm_host_mutex); if (time_after_eq(jiffies, next_gc)) - nlm_gc_hosts(); + nlm_gc_hosts(ve); /* We may keep several nlm_host objects for a peer, because each * nlm_host is identified by @@ -109,10 +111,13 @@ static struct nlm_host *nlm_lookup_host( * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. 
*/ + chain = &nlm_hosts[nlm_hash_address(ni->sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { if (!rpc_cmp_addr(nlm_addr(host), ni->sap)) continue; + if (!ve_accessible_strict(host->owner_env, ve)) + continue; /* See if we have an NSM handle for this client */ if (!nsm) @@ -186,6 +191,7 @@ static struct nlm_host *nlm_lookup_host( spin_lock_init(&host->h_lock); INIT_LIST_HEAD(&host->h_granted); INIT_LIST_HEAD(&host->h_reclaim); + host->owner_env = ve; nrhosts++; @@ -493,6 +499,11 @@ nlm_shutdown_hosts(void) struct hlist_head *chain; struct hlist_node *pos; struct nlm_host *host; + int nr_hosts_local; + struct ve_struct *ve; + + ve = get_exec_env(); + nr_hosts_local = 0; dprintk("lockd: shutting down host module\n"); mutex_lock(&nlm_host_mutex); @@ -501,24 +512,29 @@ nlm_shutdown_hosts(void) dprintk("lockd: nuking all hosts...\n"); for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { hlist_for_each_entry(host, pos, chain, h_hash) { + if (!ve_accessible_strict(host->owner_env, ve)) + continue; host->h_expires = jiffies - 1; if (host->h_rpcclnt) { rpc_shutdown_client(host->h_rpcclnt); host->h_rpcclnt = NULL; } + nr_hosts_local++; } } /* Then, perform a garbage collection pass */ - nlm_gc_hosts(); + nr_hosts_local -= nlm_gc_hosts(ve); mutex_unlock(&nlm_host_mutex); /* complain if any hosts are left */ - if (nrhosts) { + if (nr_hosts_local) { printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); - dprintk("lockd: %d hosts left:\n", nrhosts); + dprintk("lockd: %d hosts left:\n", nr_hosts_local); for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { hlist_for_each_entry(host, pos, chain, h_hash) { + if (!ve_accessible_strict(host->owner_env, ve)) + continue; dprintk(" %s (cnt %d use %d exp %ld)\n", host->h_name, atomic_read(&host->h_count), host->h_inuse, host->h_expires); @@ -532,17 +548,23 @@ nlm_shutdown_hosts(void) * This GC combines reference counting for async operations with * mark & sweep for resources held by 
remote clients. */ -static void -nlm_gc_hosts(void) +static int +nlm_gc_hosts(struct ve_struct *ve) { struct hlist_head *chain; struct hlist_node *pos, *next; struct nlm_host *host; + int freed; + + freed = 0; dprintk("lockd: host garbage collection\n"); for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry(host, pos, chain, h_hash) + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!ve_accessible_strict(host->owner_env, ve)) + continue; host->h_inuse = 0; + } } /* Mark all hosts that hold locks, blocks or shares */ @@ -551,7 +573,8 @@ nlm_gc_hosts(void) for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { hlist_for_each_entry_safe(host, pos, next, chain, h_hash) { if (atomic_read(&host->h_count) || host->h_inuse - || time_before(jiffies, host->h_expires)) { + || time_before(jiffies, host->h_expires) + || !ve_accessible_strict(host->owner_env, ve)) { dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n", host->h_name, atomic_read(&host->h_count), host->h_inuse, host->h_expires); @@ -562,8 +585,57 @@ nlm_gc_hosts(void) nlm_destroy_host(host); nrhosts--; + freed++; } } next_gc = jiffies + NLM_HOST_COLLECT; + return freed; +} + +#ifdef CONFIG_VE +void ve_nlm_shutdown_hosts(struct ve_struct *ve) +{ + envid_t veid = ve->veid; + int i; + + dprintk("lockd: shutting down host module for ve %d\n", veid); + mutex_lock(&nlm_host_mutex); + + /* Make sure no async RPC task is in progress */ + down_write(&rpc_async_task_lock); + + /* Perform a garbage collection pass */ + for (i = 0; i < NLM_HOST_NRHASH; i++) { + struct nlm_host *host; + struct hlist_node *pos, *tmp; + + hlist_for_each_entry_safe(host, pos, tmp, &nlm_hosts[i], h_hash) { + struct rpc_clnt *clnt; + + if (ve != host->owner_env) + continue; + + hlist_del(&host->h_hash); + if (host->h_nsmhandle) + host->h_nsmhandle->sm_monitored = 0; + dprintk("lockd: delete host %s ve %d\n", host->h_name, + veid); + if ((clnt = host->h_rpcclnt) != NULL) { + if 
(!list_empty(&clnt->cl_tasks)) { + printk(KERN_WARNING + "lockd: active RPC handle\n"); + rpc_kill_client(clnt); + } else + rpc_shutdown_client(clnt); + } + kfree(host); + nrhosts--; + } + } + + up_write(&rpc_async_task_lock); + + mutex_unlock(&nlm_host_mutex); } +#endif diff -urNp linux-2.6.32.48/fs/lockd/svc.c linux-2.6.32.48-openvz/fs/lockd/svc.c --- linux-2.6.32.48/fs/lockd/svc.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/lockd/svc.c 2011-11-21 17:40:45.000000000 -0500 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -47,25 +48,29 @@ struct nlmsvc_binding * nlmsvc_ops; EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); -static unsigned int nlmsvc_users; -static struct task_struct *nlmsvc_task; -static struct svc_rqst *nlmsvc_rqst; -unsigned long nlmsvc_timeout; /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 */ -static unsigned long nlm_grace_period; static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; static int nlm_udpport, nlm_tcpport; +#ifndef CONFIG_VE +static unsigned int _nlmsvc_users; +static struct task_struct *_nlmsvc_task; +static struct svc_rqst *_nlmsvc_rqst; +static unsigned long _nlmsvc_grace_period; +unsigned long _nlmsvc_timeout; +#endif + /* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ static unsigned int nlm_max_connections = 1024; /* * Constants needed for the sysctl interface. 
*/ +static unsigned long nlm_grace_period; static const unsigned long nlm_grace_period_min = 0; static const unsigned long nlm_grace_period_max = 240; static const unsigned long nlm_timeout_min = 3; @@ -176,8 +181,9 @@ lockd(void *vrqstp) } if (err < 0) { if (err != preverr) { - printk(KERN_WARNING "%s: unexpected error " - "from svc_recv (%d)\n", __func__, err); + printk(KERN_WARNING "%s: ct%d unexpected error " + "from svc_recv (%d)\n", __func__, + get_exec_env()->veid, err); preverr = err; } schedule_timeout_interruptible(HZ); @@ -280,12 +286,14 @@ int lockd_up(void) */ if (nlmsvc_users) printk(KERN_WARNING - "lockd_up: no pid, %d users??\n", nlmsvc_users); + "lockd_up: ct%d no pid, %d users??\n", + get_exec_env()->veid, nlmsvc_users); error = -ENOMEM; serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); if (!serv) { - printk(KERN_WARNING "lockd_up: create service failed\n"); + printk(KERN_WARNING "lockd_up: ct%d create service failed\n", + get_exec_env()->veid); goto out; } @@ -301,22 +309,23 @@ int lockd_up(void) error = PTR_ERR(nlmsvc_rqst); nlmsvc_rqst = NULL; printk(KERN_WARNING - "lockd_up: svc_rqst allocation failed, error=%d\n", - error); + "lockd_up: ct%d svc_rqst allocation failed, error=%d\n", + get_exec_env()->veid, error); goto destroy_and_out; } svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; - nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); + nlmsvc_task = kthread_run_ve(get_exec_env(), lockd, nlmsvc_rqst, serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); svc_exit_thread(nlmsvc_rqst); nlmsvc_task = NULL; nlmsvc_rqst = NULL; printk(KERN_WARNING - "lockd_up: kthread_run failed, error=%d\n", error); + "lockd_up: ct%d kthread_run failed, error=%d\n", + get_exec_env()->veid, error); goto destroy_and_out; } @@ -345,14 +354,15 @@ lockd_down(void) if (--nlmsvc_users) goto out; } else { - printk(KERN_ERR "lockd_down: no users! 
task=%p\n", - nlmsvc_task); - BUG(); + printk(KERN_ERR "lockd_down: ct%d no users! task=%p\n", + get_exec_env()->veid, nlmsvc_task); + goto out; } if (!nlmsvc_task) { - printk(KERN_ERR "lockd_down: no lockd running.\n"); - BUG(); + printk(KERN_ERR "lockd_down: ct%d no lockd running.\n", + get_exec_env()->veid); + goto out; } kthread_stop(nlmsvc_task); svc_exit_thread(nlmsvc_rqst); @@ -497,6 +507,29 @@ static int lockd_authenticate(struct svc return SVC_DENIED; } +#ifdef CONFIG_VE +extern void ve_nlm_shutdown_hosts(struct ve_struct *ve); + +static int ve_lockd_start(void *data) +{ + return 0; +} + +static void ve_lockd_stop(void *data) +{ + struct ve_struct *ve = (struct ve_struct *)data; + + ve_nlm_shutdown_hosts(ve); + flush_scheduled_work(); +} + +static struct ve_hook lockd_hook = { + .init = ve_lockd_start, + .fini = ve_lockd_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET, +}; +#endif param_set_min_max(port, int, simple_strtol, 0, 65535) param_set_min_max(grace_period, unsigned long, simple_strtoul, @@ -525,16 +558,20 @@ module_param(nlm_max_connections, uint, static int __init init_nlm(void) { + ve_hook_register(VE_SS_CHAIN, &lockd_hook); #ifdef CONFIG_SYSCTL nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); - return nlm_sysctl_table ? 
0 : -ENOMEM; -#else - return 0; + if (nlm_sysctl_table == NULL) { + ve_hook_unregister(&lockd_hook); + return -ENOMEM; + } #endif + return 0; } static void __exit exit_nlm(void) { + ve_hook_unregister(&lockd_hook); /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); #ifdef CONFIG_SYSCTL diff -urNp linux-2.6.32.48/fs/lockd/svcsubs.c linux-2.6.32.48-openvz/fs/lockd/svcsubs.c --- linux-2.6.32.48/fs/lockd/svcsubs.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/lockd/svcsubs.c 2011-11-21 17:40:45.000000000 -0500 @@ -334,6 +334,9 @@ nlmsvc_is_client(void *data, struct nlm_ { struct nlm_host *host = data; + if (!ve_accessible_strict(host->owner_env, get_exec_env())) + return 0; + if (host->h_server) { /* we are destroying locks even though the client * hasn't asked us too, so don't unmonitor the diff -urNp linux-2.6.32.48/fs/locks.c linux-2.6.32.48-openvz/fs/locks.c --- linux-2.6.32.48/fs/locks.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/locks.c 2011-11-21 17:40:45.000000000 -0500 @@ -130,6 +130,8 @@ #include +#include + #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) @@ -146,9 +148,25 @@ static LIST_HEAD(blocked_list); static struct kmem_cache *filelock_cache __read_mostly; /* Allocate an empty lock structure. 
*/ -static struct file_lock *locks_alloc_lock(void) +static struct file_lock *locks_alloc_lock(int charge) { - return kmem_cache_alloc(filelock_cache, GFP_KERNEL); + struct file_lock *fl; + + fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); +#ifdef CONFIG_BEANCOUNTERS + if (fl == NULL) + goto out; + fl->fl_charged = 0; + if (!charge) + goto out; + if (!ub_flock_charge(fl, 1)) + goto out; + + kmem_cache_free(filelock_cache, fl); + fl = NULL; +out: +#endif + return fl; } void locks_release_private(struct file_lock *fl) @@ -174,6 +192,7 @@ static void locks_free_lock(struct file_ BUG_ON(!list_empty(&fl->fl_block)); BUG_ON(!list_empty(&fl->fl_link)); + ub_flock_uncharge(fl); locks_release_private(fl); kmem_cache_free(filelock_cache, fl); } @@ -277,7 +296,7 @@ static int flock_make_lock(struct file * if (type < 0) return type; - fl = locks_alloc_lock(); + fl = locks_alloc_lock(type != F_UNLCK); if (fl == NULL) return -ENOMEM; @@ -464,7 +483,7 @@ static int lease_init(struct file *filp, /* Allocate a file_lock initialised to this type of lease */ static struct file_lock *lease_alloc(struct file *filp, int type) { - struct file_lock *fl = locks_alloc_lock(); + struct file_lock *fl = locks_alloc_lock(1); int error = -ENOMEM; if (fl == NULL) @@ -735,8 +754,13 @@ static int flock_lock_file(struct file * goto find_conflict; if (request->fl_type != F_UNLCK) { + /* + * A non-F_UNLCK request must already have been charged in + * flock_make_lock(). Strictly, new_fl should be charged, not the + * request, but we try to fail earlier.
+ */ error = -ENOMEM; - new_fl = locks_alloc_lock(); + new_fl = locks_alloc_lock(0); if (new_fl == NULL) goto out; error = 0; @@ -788,6 +812,10 @@ find_conflict: } if (request->fl_flags & FL_ACCESS) goto out; + + set_flock_charged(new_fl); + unset_flock_charged(request); + locks_copy_lock(new_fl, request); locks_insert_lock(before, new_fl); new_fl = NULL; @@ -819,8 +847,11 @@ static int __posix_lock_file(struct inod if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK || request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { - new_fl = locks_alloc_lock(); - new_fl2 = locks_alloc_lock(); + if (request->fl_type != F_UNLCK) + new_fl = locks_alloc_lock(1); + else + new_fl = NULL; + new_fl2 = locks_alloc_lock(0); } lock_kernel(); @@ -954,7 +985,7 @@ static int __posix_lock_file(struct inod * bail out. */ error = -ENOLCK; /* "no luck" */ - if (right && left == right && !new_fl2) + if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2)) goto out; error = 0; @@ -965,23 +996,32 @@ static int __posix_lock_file(struct inod goto out; } - if (!new_fl) { - error = -ENOLCK; + error = -ENOLCK; + if (!new_fl) + goto out; + if (right && (left == right) && ub_flock_charge(new_fl, 1)) goto out; - } locks_copy_lock(new_fl, request); locks_insert_lock(before, new_fl); new_fl = NULL; + error = 0; } if (right) { if (left == right) { /* The new lock breaks the old one in two pieces, * so we have to use the second new lock. 
*/ + error = -ENOLCK; + if (added && ub_flock_charge(new_fl2, + request->fl_type != F_UNLCK)) + goto out; + /* FIXME move all fl_charged manipulations in ub code */ + set_flock_charged(new_fl2); left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); locks_insert_lock(before, left); + error = 0; } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1366,7 +1406,7 @@ int generic_setlease(struct file *filp, if (arg != F_UNLCK) { error = -ENOMEM; - new_fl = locks_alloc_lock(); + new_fl = locks_alloc_lock(1); if (new_fl == NULL) goto out; @@ -1610,6 +1650,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, out: return error; } +EXPORT_SYMBOL_GPL(sys_flock); /** * vfs_test_lock - test file byte range lock @@ -1770,7 +1811,7 @@ static int do_lock_file_wait(struct file int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock __user *l) { - struct file_lock *file_lock = locks_alloc_lock(); + struct file_lock *file_lock = locks_alloc_lock(0); struct flock flock; struct inode *inode; struct file *f; @@ -1888,7 +1929,7 @@ out: int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct flock64 __user *l) { - struct file_lock *file_lock = locks_alloc_lock(); + struct file_lock *file_lock = locks_alloc_lock(0); struct flock64 flock; struct inode *inode; struct file *f; @@ -2159,6 +2200,8 @@ static int locks_show(struct seq_file *f struct file_lock *fl, *bfl; fl = list_entry(v, struct file_lock, fl_link); + if (!ve_accessible(fl->fl_file->owner_env, get_exec_env())) + goto out; lock_get_status(f, fl, (long)f->private, ""); @@ -2166,6 +2209,7 @@ static int locks_show(struct seq_file *f lock_get_status(f, bfl, (long)f->private, " ->"); f->private++; +out: return 0; } @@ -2207,7 +2251,7 @@ static const struct file_operations proc static int __init proc_locks_init(void) { - proc_create("locks", 0, NULL, &proc_locks_operations); + proc_create("locks", 0, &glob_proc_root, &proc_locks_operations); return 0; } 
module_init(proc_locks_init); @@ -2294,7 +2338,7 @@ EXPORT_SYMBOL(lock_may_write); static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", - sizeof(struct file_lock), 0, SLAB_PANIC, + sizeof(struct file_lock), 0, SLAB_PANIC|SLAB_UBC, init_once); return 0; } diff -urNp linux-2.6.32.48/fs/Makefile linux-2.6.32.48-openvz/fs/Makefile --- linux-2.6.32.48/fs/Makefile 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/Makefile 2011-11-21 17:40:45.000000000 -0500 @@ -53,6 +53,8 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl obj-y += quota/ +obj-$(CONFIG_SIM_FS) += simfs.o + obj-$(CONFIG_PROC_FS) += proc/ obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ diff -urNp linux-2.6.32.48/fs/namei.c linux-2.6.32.48-openvz/fs/namei.c --- linux-2.6.32.48/fs/namei.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/namei.c 2011-11-21 17:40:45.000000000 -0500 @@ -143,6 +143,7 @@ char * getname(const char __user * filen { char *tmp, *result; + /*ub_dentry_checkup();*/ result = ERR_PTR(-ENOMEM); tmp = __getname(); if (tmp) { @@ -428,6 +429,21 @@ static struct dentry * cached_lookup(str if (!dentry) dentry = d_lookup(parent, name); + /* + * The revalidation rules are simple: + * d_revalidate operation is called when we're about to use a cached + * dentry rather than call d_lookup. + * d_revalidate method may unhash the dentry itself or return FALSE, in + * which case if the dentry can be released d_lookup will be called. + * + * Additionally, by request of NFS people + * (http://linux.bkbits.net:8080/linux-2.4/cset@1.181?nav=index.html|src/|src/fs|related/fs/namei.c) + * d_revalidate is called when `/', `.' or `..' are looked up. + * Since re-lookup is impossible on them, we introduce a hack and + * return an error in this case. 
+ * + * 2003/02/19 SAW + */ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) dentry = do_revalidate(dentry, nd); @@ -479,6 +495,7 @@ static struct dentry * real_lookup(struc struct dentry * result; struct inode *dir = parent->d_inode; +repeat: mutex_lock(&dir->i_mutex); /* * First re-do the cached lookup just in case it was created @@ -525,7 +542,7 @@ out_unlock: if (result->d_op && result->d_op->d_revalidate) { result = do_revalidate(result, nd); if (!result) - result = ERR_PTR(-ENOENT); + goto repeat; } return result; } @@ -765,6 +782,12 @@ static __always_inline void follow_dotdo nd->path.mnt == nd->root.mnt) { break; } +#ifdef CONFIG_VE + if (nd->path.dentry == get_exec_env()->root_path.dentry && + nd->path.mnt == get_exec_env()->root_path.mnt) { + break; + } +#endif spin_lock(&dcache_lock); if (nd->path.dentry != nd->path.mnt->mnt_root) { nd->path.dentry = dget(nd->path.dentry->d_parent); @@ -805,6 +828,10 @@ static int do_lookup(struct nameidata *n if (dentry->d_op && dentry->d_op->d_revalidate) goto need_revalidate; done: + if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) { + dput(dentry); + return -ENOENT; + } path->mnt = mnt; path->dentry = dentry; __follow_mount(path); @@ -836,6 +863,7 @@ fail: static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) { return inode && unlikely(inode->i_op->follow_link) && + !(lookup_flags & LOOKUP_STRICT) && ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode)); } @@ -853,6 +881,7 @@ static int __link_path_walk(const char * struct inode *inode; int err; unsigned int lookup_flags = nd->flags; + int real_components = 0; while (*name=='/') name++; @@ -921,6 +950,7 @@ static int __link_path_walk(const char * break; } /* This does the actual lookups.. 
*/ + real_components++; err = do_lookup(nd, &this, &next); if (err) break; @@ -931,6 +961,9 @@ static int __link_path_walk(const char * goto out_dput; if (inode->i_op->follow_link) { + err = -ENOENT; + if (lookup_flags & LOOKUP_STRICT) + goto out_dput; err = do_follow_link(&next, nd); if (err) goto return_err; @@ -996,27 +1029,41 @@ lookup_parent: nd->last_type = LAST_NORM; if (this.name[0] != '.') goto return_base; - if (this.len == 1) + if (this.len == 1) { nd->last_type = LAST_DOT; - else if (this.len == 2 && this.name[1] == '.') + goto return_reval; + } else if (this.len == 2 && this.name[1] == '.') { nd->last_type = LAST_DOTDOT; - else - goto return_base; + goto return_reval; + } +return_base: + if (!(nd->flags & LOOKUP_NOAREACHECK)) { + err = check_area_access_ve(&nd->path); + if (err) + break; + } + return 0; return_reval: /* * We bypassed the ordinary revalidation routines. * We may need to check the cached dentry for staleness. */ - if (nd->path.dentry && nd->path.dentry->d_sb && + if (!real_components && nd->path.dentry && nd->path.dentry->d_sb && (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { err = -ESTALE; /* Note: we do not d_invalidate() */ if (!nd->path.dentry->d_op->d_revalidate( nd->path.dentry, nd)) + /* + * This lookup is for `/' or `.' or `..'. + * The filesystem unhashed the dentry itself + * inside d_revalidate (otherwise, d_invalidate + * wouldn't succeed). As a special courtesy to + * NFS we return an error. 
2003/02/19 SAW + */ break; } -return_base: - return 0; + goto return_base; out_dput: path_put_conditional(&next, nd); break; @@ -2095,6 +2142,7 @@ SYSCALL_DEFINE3(mknod, const char __user { return sys_mknodat(AT_FDCWD, filename, mode, dev); } +EXPORT_SYMBOL_GPL(sys_mknod); int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { @@ -2159,6 +2207,7 @@ SYSCALL_DEFINE2(mkdir, const char __user { return sys_mkdirat(AT_FDCWD, pathname, mode); } +EXPORT_SYMBOL_GPL(sys_mkdir); /* * We try to drop the dentry early: we should have @@ -2186,6 +2235,7 @@ void dentry_unhash(struct dentry *dentry spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); } +EXPORT_SYMBOL(sys_symlink); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -2273,6 +2323,7 @@ SYSCALL_DEFINE1(rmdir, const char __user { return do_rmdir(AT_FDCWD, pathname); } +EXPORT_SYMBOL_GPL(sys_rmdir); int vfs_unlink(struct inode *dir, struct dentry *dentry) { @@ -2380,6 +2431,7 @@ SYSCALL_DEFINE1(unlink, const char __use { return do_unlinkat(AT_FDCWD, pathname); } +EXPORT_SYMBOL_GPL(sys_unlink); int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) { @@ -2550,6 +2602,7 @@ SYSCALL_DEFINE2(link, const char __user { return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +EXPORT_SYMBOL(sys_rename); /* * The worst of all namespace operations - renaming directory. 
"Perverted" @@ -2661,6 +2714,9 @@ int vfs_rename(struct inode *old_dir, st int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); const char *old_name; + if (vfs_dq_rename(old_dentry->d_inode, old_dir, new_dir)) + return -EXDEV; + if (old_dentry->d_inode == new_dentry->d_inode) return 0; diff -urNp linux-2.6.32.48/fs/namespace.c linux-2.6.32.48-openvz/fs/namespace.c --- linux-2.6.32.48/fs/namespace.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/namespace.c 2011-11-21 17:40:45.000000000 -0500 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include "pnode.h" @@ -39,6 +40,7 @@ /* spinlock for vfsmount related operations, inplace of dcache_lock */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); +EXPORT_SYMBOL(vfsmount_lock); static int event; static DEFINE_IDA(mnt_id_ida); @@ -48,7 +50,8 @@ static int mnt_group_start = 1; static struct list_head *mount_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; -static struct rw_semaphore namespace_sem; +struct rw_semaphore namespace_sem; +EXPORT_SYMBOL_GPL(namespace_sem); /* /sys/fs */ struct kobject *fs_kobj; @@ -136,11 +139,12 @@ struct vfsmount *alloc_vfsmnt(const char goto out_free_cache; if (name) { - mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup(name, GFP_KERNEL_UBC); if (!mnt->mnt_devname) goto out_free_id; } + mnt->owner = VEID(get_exec_env()); atomic_set(&mnt->mnt_count, 1); INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -517,7 +521,7 @@ static void commit_tree(struct vfsmount touch_mnt_namespace(n); } -static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) +struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { @@ -532,6 +536,7 @@ static struct vfsmount *next_mnt(struct } return list_entry(next, struct vfsmount, mnt_child); } +EXPORT_SYMBOL(next_mnt); static struct vfsmount 
*skip_mnt_tree(struct vfsmount *p) { @@ -629,6 +634,7 @@ repeat: spin_unlock(&vfsmount_lock); acct_auto_close_mnt(mnt); security_sb_umount_close(mnt); + fsnotify_unmount_mnt(mnt); goto repeat; } } @@ -789,15 +795,50 @@ static void show_type(struct seq_file *m } } +static int prepare_mnt_root_mangle(struct path *path, + char **path_buf, char **ret_path) +{ + /* skip FS_NOMOUNT mounts (rootfs) */ + if (path->mnt->mnt_sb->s_flags & MS_NOUSER) + return -EACCES; + + *path_buf = (char *)__get_free_page(GFP_KERNEL); + if (!*path_buf) + return -ENOMEM; + + *ret_path = d_path(path, *path_buf, PAGE_SIZE); + if (IS_ERR(*ret_path)) { + free_page((unsigned long)*path_buf); + /* + * This means that the file position will be incremented, i.e. + * the total number of "invisible" vfsmnt will leak. + */ + return -EACCES; + } + return 0; +} + static int show_vfsmnt(struct seq_file *m, void *v) { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; + int err; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + char *path_buf, *path; - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path); + if (err < 0) + return (err == -EACCES ? 0 : err); + + if (ve_is_super(get_exec_env()) || + !(mnt->mnt_sb->s_type->fs_flags & FS_MANGLE_PROC)) + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + else { + seq_puts(m, "/dev/"); + mangle(m, mnt->mnt_sb->s_type->name); + } seq_putc(m, ' '); - seq_path(m, &mnt_path, " \t\n\\"); + mangle(m, path); + free_page((unsigned long) path_buf); seq_putc(m, ' '); show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ?
" ro" : " rw"); @@ -884,18 +925,27 @@ static int show_vfsstat(struct seq_file { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - int err = 0; + char *path_buf, *path; + int err; + + err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path); + if (err < 0) + return (err == -EACCES ? 0 : err); /* device */ if (mnt->mnt_devname) { seq_puts(m, "device "); - mangle(m, mnt->mnt_devname); + if (ve_is_super(get_exec_env())) + mangle(m, mnt->mnt_devname); + else + mangle(m, mnt->mnt_sb->s_type->name); } else seq_puts(m, "no device"); /* mount point */ seq_puts(m, " mounted on "); - seq_path(m, &mnt_path, " \t\n\\"); + mangle(m, path); + free_page((unsigned long)path_buf); seq_putc(m, ' '); /* file system type */ @@ -1107,6 +1157,39 @@ static int do_umount(struct vfsmount *mn return retval; } +#ifdef CONFIG_VE +void umount_ve_fs_type(struct file_system_type *local_fs_type, int veid) +{ + struct vfsmount *mnt; + struct list_head *p, *q; + LIST_HEAD(kill); + LIST_HEAD(umount_list); + + down_write(&namespace_sem); + spin_lock(&vfsmount_lock); + list_for_each_safe(p, q, &current->nsproxy->mnt_ns->list) { + mnt = list_entry(p, struct vfsmount, mnt_list); + if (mnt->mnt_sb->s_type != local_fs_type) + continue; + if (veid >= 0 && mnt->owner != veid) + continue; + list_del(p); + list_add(p, &kill); + } + + while (!list_empty(&kill)) { + LIST_HEAD(kill2); + mnt = list_entry(kill.next, struct vfsmount, mnt_list); + umount_tree(mnt, 1, &kill2); + list_splice(&kill2, &umount_list); + } + spin_unlock(&vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); +} +EXPORT_SYMBOL(umount_ve_fs_type); +#endif + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices.
@@ -1137,7 +1220,7 @@ SYSCALL_DEFINE2(umount, char __user *, n goto dput_and_out; retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) goto dput_and_out; retval = do_umount(path.mnt, flags); @@ -1163,7 +1246,7 @@ SYSCALL_DEFINE1(oldumount, char __user * static int mount_is_safe(struct path *path) { - if (capable(CAP_SYS_ADMIN)) + if (capable(CAP_VE_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet @@ -1432,6 +1515,8 @@ static int do_change_type(struct path *p if (path->dentry != path->mnt->mnt_root) return -EINVAL; + if (!ve_accessible_veid(path->mnt->owner, get_exec_env()->veid)) + return -EPERM; down_write(&namespace_sem); if (type == MS_SHARED) { @@ -1454,7 +1539,7 @@ static int do_change_type(struct path *p * do loopback mount. */ static int do_loopback(struct path *path, char *old_name, - int recurse) + int recurse, int mnt_flags) { struct path old_path; struct vfsmount *mnt = NULL; @@ -1484,6 +1569,7 @@ static int do_loopback(struct path *path if (!mnt) goto out; + mnt->mnt_flags |= mnt_flags; err = graft_tree(mnt, path); if (err) { LIST_HEAD(umount_list); @@ -1527,7 +1613,7 @@ static int do_remount(struct path *path, int err; struct super_block *sb = path->mnt->mnt_sb; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (!check_mnt(path->mnt)) @@ -1536,6 +1622,9 @@ static int do_remount(struct path *path, if (path->dentry != path->mnt->mnt_root) return -EINVAL; + if (!ve_accessible_veid(path->mnt->owner, get_exec_env()->veid)) + return -EPERM; + down_write(&sb->s_umount); if (flags & MS_BIND) err = change_mount_flags(path->mnt, flags); @@ -1569,7 +1658,7 @@ static int do_move_mount(struct path *pa struct path old_path, parent_path; struct vfsmount *p; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1577,6 +1666,10 @@ static int do_move_mount(struct path *pa if (err) return err; + err = -EPERM; + if 
(!ve_accessible_veid(old_path.mnt->owner, get_exec_env()->veid)) + goto out_nosem; + down_write(&namespace_sem); while (d_mountpoint(path->dentry) && follow_down(path)) @@ -1634,6 +1727,7 @@ out: up_write(&namespace_sem); if (!err) path_put(&parent_path); +out_nosem: path_put(&old_path); return err; } @@ -1651,7 +1745,7 @@ static int do_new_mount(struct path *pat return -EINVAL; /* we need capabilities... */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; lock_kernel(); @@ -1692,6 +1786,11 @@ int do_add_mount(struct vfsmount *newmnt goto unlock; newmnt->mnt_flags = mnt_flags; + + /* make this before graft_tree reveals mnt_root to the world... */ + if (path->dentry->d_flags & DCACHE_VIRTUAL) + newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL; + if ((err = graft_tree(newmnt, path))) goto unlock; @@ -1966,7 +2065,7 @@ long do_mount(char *dev_name, char *dir_ retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&path, dev_name, flags & MS_REC); + retval = do_loopback(&path, dev_name, flags & MS_REC, mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&path, flags); else if (flags & MS_MOVE) @@ -2129,6 +2228,7 @@ out_dir: out_type: return ret; } +EXPORT_SYMBOL_GPL(sys_mount); /* * pivot_root Semantics: @@ -2288,7 +2388,7 @@ void __init mnt_init(void) init_rwsem(&namespace_sem); mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); diff -urNp linux-2.6.32.48/fs/nfs/client.c linux-2.6.32.48-openvz/fs/nfs/client.c --- linux-2.6.32.48/fs/nfs/client.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/nfs/client.c 2011-11-21 17:40:45.000000000 -0500 @@ -125,6 +125,7 @@ static struct nfs_client *nfs_alloc_clie atomic_set(&clp->cl_count, 
1); clp->cl_cons_state = NFS_CS_INITING; + clp->owner_env = get_exec_env(); memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); clp->cl_addrlen = cl_init->addrlen; @@ -364,6 +365,7 @@ static int nfs_sockaddr_cmp(const struct struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) { struct nfs_client *clp; + struct ve_struct *ve = get_exec_env(); spin_lock(&nfs_client_lock); list_for_each_entry(clp, &nfs_client_list, cl_share_link) { @@ -378,6 +380,9 @@ struct nfs_client *nfs_find_client(const if (clp->rpc_ops->version != nfsversion) continue; + if (!ve_accessible_strict(clp->owner_env, ve)) + continue; + /* Match only the IP address, not the port number */ if (!nfs_sockaddr_match_ipaddr(addr, clap)) continue; @@ -398,6 +403,7 @@ struct nfs_client *nfs_find_client_next( { struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; u32 nfsvers = clp->rpc_ops->version; + struct ve_struct *ve = get_exec_env(); spin_lock(&nfs_client_lock); list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { @@ -411,6 +417,9 @@ struct nfs_client *nfs_find_client_next( if (clp->rpc_ops->version != nfsvers) continue; + if (!ve_accessible_strict(clp->owner_env, ve)) + continue; + /* Match only the IP address, not the port number */ if (!nfs_sockaddr_match_ipaddr(sap, clap)) continue; @@ -431,13 +440,18 @@ static struct nfs_client *nfs_match_clie { struct nfs_client *clp; const struct sockaddr *sap = data->addr; + struct ve_struct *ve; + ve = get_exec_env(); list_for_each_entry(clp, &nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; + if (!ve_accessible_strict(clp->owner_env, ve)) + continue; + /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->rpc_ops) continue; diff -urNp linux-2.6.32.48/fs/nfs/super.c linux-2.6.32.48-openvz/fs/nfs/super.c --- 
linux-2.6.32.48/fs/nfs/super.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/nfs/super.c 2011-11-21 17:40:45.000000000 -0500 @@ -53,6 +53,9 @@ #include #include #include +#include +#include +#include #include #include @@ -250,7 +253,8 @@ static struct file_system_type nfs_fs_ty .name = "nfs", .get_sb = nfs_get_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT| + FS_BINARY_MOUNTDATA|FS_VIRTUALIZED, }; struct file_system_type nfs_xdev_fs_type = { @@ -258,7 +262,8 @@ struct file_system_type nfs_xdev_fs_type .name = "nfs", .get_sb = nfs_xdev_get_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT| + FS_BINARY_MOUNTDATA|FS_VIRTUALIZED, }; static const struct super_operations nfs_sops = { @@ -350,6 +355,60 @@ static struct shrinker acl_shrinker = { .seeks = DEFAULT_SEEKS, }; +#ifdef CONFIG_VE +static int ve_nfs_start(void *data) +{ + return 0; +} + +static void ve_nfs_stop(void *data) +{ + struct ve_struct *ve; + struct super_block *sb; + + flush_scheduled_work(); + + ve = (struct ve_struct *)data; + /* Basically, on a valid stop we can be here iff NFS was mounted + read-only. In such a case client force-stop is not a problem. + If we are here and NFS is read-write, we are in a FORCE stop, so + force the client to stop. + Lock daemon is already dead. + Only superblock client remains. 
Den */ + + down_write(&rpc_async_task_lock); + + spin_lock(&sb_lock); + list_for_each_entry(sb, &nfs_fs_type.fs_supers, s_instances) { + struct nfs_server *srv; + struct ve_struct *owner_env; + + srv = NFS_SB(sb); + owner_env = srv->client->cl_xprt->owner_env; + + if (ve_accessible_strict(owner_env, ve)) { + rpc_kill_client(srv->client); + rpc_kill_client(srv->client_acl); + } + } + spin_unlock(&sb_lock); + + /* Make sure no async RPC task is in progress */ + up_write(&rpc_async_task_lock); + + umount_ve_fs_type(&nfs_fs_type, ve->veid); + + flush_scheduled_work(); +} + +static struct ve_hook nfs_hook = { + .init = ve_nfs_start, + .fini = ve_nfs_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET_POST, +}; +#endif + /* * Register the NFS filesystems */ @@ -370,6 +429,7 @@ int __init register_nfs_fs(void) goto error_2; #endif register_shrinker(&acl_shrinker); + ve_hook_register(VE_INIT_EXIT_CHAIN, &nfs_hook); return 0; #ifdef CONFIG_NFS_V4 @@ -388,6 +448,7 @@ error_0: void __exit unregister_nfs_fs(void) { unregister_shrinker(&acl_shrinker); + ve_hook_unregister(&nfs_hook); #ifdef CONFIG_NFS_V4 unregister_filesystem(&nfs4_fs_type); #endif @@ -1815,6 +1876,11 @@ static int nfs_validate_mount_data(void goto out_v3_not_compiled; #endif /* !CONFIG_NFS_V3 */ + if (!(args->flags & NFS_MOUNT_VER3)) { + printk("NFSv2 is broken and not supported\n"); + return -EPROTONOSUPPORT; + } + return 0; out_no_data: @@ -2109,6 +2175,10 @@ static int nfs_compare_super(struct supe struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb); int mntflags = sb_mntdata->mntflags; + if (!ve_accessible_strict(old->client->cl_xprt->owner_env, + get_exec_env())) + return 0; + if (!nfs_compare_super_address(old, server)) return 0; /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */ @@ -2137,6 +2207,11 @@ static int nfs_get_sb(struct file_system .mntflags = flags, }; int error = -ENOMEM; + struct ve_struct *ve; + + ve = get_exec_env(); + if (!(ve->features & VE_FEATURE_NFS)) + return 
-ENODEV; data = nfs_alloc_parsed_mount_data(3); mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); @@ -2267,6 +2342,11 @@ static int nfs_xdev_get_sb(struct file_s .mntflags = flags, }; int error; + struct ve_struct *ve; + + ve = get_exec_env(); + if (!(ve->features & VE_FEATURE_NFS)) + return -ENODEV; dprintk("--> nfs_xdev_get_sb()\n"); diff -urNp linux-2.6.32.48/fs/notify/inode_mark.c linux-2.6.32.48-openvz/fs/notify/inode_mark.c --- linux-2.6.32.48/fs/notify/inode_mark.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/notify/inode_mark.c 2011-11-21 17:40:45.000000000 -0500 @@ -90,6 +90,7 @@ #include #include #include /* for inode_lock */ +#include #include @@ -262,6 +263,23 @@ void fsnotify_clear_marks_by_inode(struc } } +static void fsnotify_detach_mnt(struct inode *inode) +{ + struct fsnotify_mark_entry *entry; + struct hlist_node *pos; + struct fsnotify_group *group; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { + spin_lock(&entry->lock); + group = entry->group; + if (group->ops->detach_mnt) + group->ops->detach_mnt(entry); + spin_unlock(&entry->lock); + } + spin_unlock(&inode->i_lock); +} + /* * given a group and inode, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL @@ -362,7 +380,7 @@ int fsnotify_add_mark(struct fsnotify_ma * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. * We temporarily drop inode_lock, however, and CAN block. 
*/ -void fsnotify_unmount_inodes(struct list_head *list) +static void fsnotify_unmount(struct list_head *list, struct vfsmount *mnt) { struct inode *inode, *next_i, *need_iput = NULL; @@ -414,13 +432,29 @@ void fsnotify_unmount_inodes(struct list if (need_iput_tmp) iput(need_iput_tmp); - /* for each watch, send FS_UNMOUNT and then remove it */ - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + if (mnt) + fsnotify_detach_mnt(inode); + else { + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - fsnotify_inode_delete(inode); + fsnotify_inode_delete(inode); + } iput(inode); spin_lock(&inode_lock); } } + +void fsnotify_unmount_inodes(struct list_head *list) +{ + fsnotify_unmount(list, NULL); +} + +void fsnotify_unmount_mnt(struct vfsmount *mnt) +{ + spin_lock(&inode_lock); + fsnotify_unmount(&mnt->mnt_sb->s_inodes, mnt); + spin_unlock(&inode_lock); +} diff -urNp linux-2.6.32.48/fs/notify/inotify/inotify_fsnotify.c linux-2.6.32.48-openvz/fs/notify/inotify/inotify_fsnotify.c --- linux-2.6.32.48/fs/notify/inotify/inotify_fsnotify.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/notify/inotify/inotify_fsnotify.c 2011-11-21 17:40:45.000000000 -0500 @@ -29,6 +29,7 @@ #include /* kmem_* */ #include #include +#include #include "inotify.h" @@ -164,10 +165,25 @@ void inotify_free_event_priv(struct fsno kmem_cache_free(event_priv_cachep, event_priv); } +static void inotify_detach_mnt(struct fsnotify_mark_entry *fe) +{ + struct inotify_inode_mark_entry *e; + + e = container_of(fe, struct inotify_inode_mark_entry, fsn_entry); + if (e->path.dentry) { + dput(e->path.dentry); + e->path.dentry = NULL; + mnt_unpin(e->path.mnt); + mntput(e->path.mnt); + e->path.dentry = NULL; + } +} + const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, 
.free_event_priv = inotify_free_event_priv, .freeing_mark = inotify_freeing_mark, + .detach_mnt = inotify_detach_mnt, }; diff -urNp linux-2.6.32.48/fs/notify/inotify/inotify.h linux-2.6.32.48-openvz/fs/notify/inotify/inotify.h --- linux-2.6.32.48/fs/notify/inotify/inotify.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/notify/inotify/inotify.h 2011-11-21 17:40:45.000000000 -0500 @@ -13,6 +13,7 @@ struct inotify_inode_mark_entry { /* fsnotify_mark_entry MUST be the first thing */ struct fsnotify_mark_entry fsn_entry; int wd; + struct path path; }; extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, diff -urNp linux-2.6.32.48/fs/notify/inotify/inotify_user.c linux-2.6.32.48-openvz/fs/notify/inotify/inotify_user.c --- linux-2.6.32.48/fs/notify/inotify/inotify_user.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/notify/inotify/inotify_user.c 2011-11-21 17:40:45.000000000 -0500 @@ -40,6 +40,7 @@ #include #include #include +#include #include "inotify.h" @@ -343,7 +344,7 @@ static long inotify_ioctl(struct file *f return ret; } -static const struct file_operations inotify_fops = { +const struct file_operations inotify_fops = { .poll = inotify_poll, .read = inotify_read, .fasync = inotify_fasync, @@ -351,6 +352,7 @@ static const struct file_operations inot .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, }; +EXPORT_SYMBOL(inotify_fops); /* @@ -464,6 +466,12 @@ static void inotify_free_mark(struct fsn { struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; + if (ientry->path.dentry) { + dput(ientry->path.dentry); + mnt_unpin(ientry->path.mnt); + mntput(ientry->path.mnt); + } + kmem_cache_free(inotify_inode_mark_cachep, ientry); } @@ -530,16 +538,13 @@ static int inotify_update_existing_watch return ret; } -static int inotify_new_watch(struct fsnotify_group *group, - struct inode *inode, - u32 arg) +int __inotify_new_watch(struct fsnotify_group *group, + struct path 
*path, __u32 mask, int wd) { struct inotify_inode_mark_entry *tmp_ientry; - __u32 mask; + u32 start_wd; int ret; - /* don't allow invalid bits: we don't want flags set */ - mask = inotify_arg_to_mask(arg); if (unlikely(!mask)) return -EINVAL; @@ -550,6 +555,8 @@ static int inotify_new_watch(struct fsno fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); tmp_ientry->fsn_entry.mask = mask; tmp_ientry->wd = -1; + tmp_ientry->path.dentry = NULL; + tmp_ientry->path.mnt = NULL; ret = -ENOSPC; if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) @@ -559,13 +566,16 @@ retry: if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) goto out_err; + if (wd == -1) + start_wd = group->inotify_data.last_wd + 1; + else + start_wd = wd; /* we are putting the mark on the idr, take a reference */ fsnotify_get_mark(&tmp_ientry->fsn_entry); spin_lock(&group->inotify_data.idr_lock); ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, - group->inotify_data.last_wd+1, - &tmp_ientry->wd); + start_wd, &tmp_ientry->wd); spin_unlock(&group->inotify_data.idr_lock); if (ret) { /* we didn't get on the idr, drop the idr reference */ @@ -577,8 +587,15 @@ retry: goto out_err; } + if (wd != -1 && tmp_ientry->wd != wd) { + ret = -EBUSY; + fsnotify_put_mark(&tmp_ientry->fsn_entry); + inotify_remove_from_idr(group, tmp_ientry); + goto out_err; + } + /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); + ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, path->dentry->d_inode); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_ientry); @@ -591,6 +608,12 @@ retry: /* increment the number of watches the user has */ atomic_inc(&group->inotify_data.user->inotify_watches); + if (!ve_is_super(get_exec_env())) { + tmp_ientry->path.dentry = dget(path->dentry); + mnt_pin(path->mnt); + tmp_ientry->path.mnt = path->mnt; + } + 
/* return the watch descriptor for this new entry */ ret = tmp_ientry->wd; @@ -607,17 +630,24 @@ out_err: return ret; } +EXPORT_SYMBOL(__inotify_new_watch); + +static int inotify_new_watch(struct fsnotify_group *group, + struct path *path, u32 arg) +{ + return __inotify_new_watch(group, path, inotify_arg_to_mask(arg), -1); +} -static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +static int inotify_update_watch(struct fsnotify_group *group, struct path *path, u32 arg) { int ret = 0; retry: /* try to update and existing watch with the new arg */ - ret = inotify_update_existing_watch(group, inode, arg); + ret = inotify_update_existing_watch(group, path->dentry->d_inode, arg); /* no mark present, try to add a new one */ if (ret == -ENOENT) - ret = inotify_new_watch(group, inode, arg); + ret = inotify_new_watch(group, path, arg); /* * inotify_new_watch could race with another thread which did an * inotify_new_watch between the update_existing and the add watch @@ -717,12 +747,12 @@ SYSCALL_DEFINE0(inotify_init) { return sys_inotify_init1(0); } +EXPORT_SYMBOL(sys_inotify_init); SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, u32, mask) { struct fsnotify_group *group; - struct inode *inode; struct path path; struct file *filp; int ret, fput_needed; @@ -747,12 +777,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, if (ret) goto fput_and_out; - /* inode held in place by reference to path; group by fget on fd */ - inode = path.dentry->d_inode; group = filp->private_data; /* create/update an inode mark */ - ret = inotify_update_watch(group, inode, mask); + ret = inotify_update_watch(group, &path, mask); if (unlikely(ret)) goto path_put_and_out; diff -urNp linux-2.6.32.48/fs/open.c linux-2.6.32.48-openvz/fs/open.c --- linux-2.6.32.48/fs/open.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/open.c 2011-11-21 17:40:45.000000000 -0500 @@ -25,6 +25,7 @@ #include #include #include +#include #include 
#include #include @@ -52,7 +53,21 @@ int vfs_statfs(struct dentry *dentry, st EXPORT_SYMBOL(vfs_statfs); -static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) +int faudit_statfs(struct super_block *sb, struct kstatfs *buf) +{ + struct faudit_statfs_arg arg; + + arg.sb = sb; + arg.stat = buf; + + if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg) + != NOTIFY_DONE) + return arg.err; + return 0; +} + +static int vfs_statfs_native(struct dentry *dentry, struct vfsmount *mnt, + struct statfs *buf) { struct kstatfs st; int retval; @@ -61,6 +76,10 @@ static int vfs_statfs_native(struct dent if (retval) return retval; + retval = faudit_statfs(mnt->mnt_sb, &st); + if (retval) + return retval; + if (sizeof(*buf) == sizeof(st)) memcpy(buf, &st, sizeof(st)); else { @@ -96,7 +115,8 @@ static int vfs_statfs_native(struct dent return 0; } -static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) +static int vfs_statfs64(struct dentry *dentry, struct vfsmount *mnt, + struct statfs64 *buf) { struct kstatfs st; int retval; @@ -105,6 +125,10 @@ static int vfs_statfs64(struct dentry *d if (retval) return retval; + retval = faudit_statfs(mnt->mnt_sb, &st); + if (retval) + return retval; + if (sizeof(*buf) == sizeof(st)) memcpy(buf, &st, sizeof(st)); else { @@ -131,7 +155,7 @@ SYSCALL_DEFINE2(statfs, const char __use error = user_path(pathname, &path); if (!error) { struct statfs tmp; - error = vfs_statfs_native(path.dentry, &tmp); + error = vfs_statfs_native(path.dentry, path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_put(&path); @@ -149,7 +173,7 @@ SYSCALL_DEFINE3(statfs64, const char __u error = user_path(pathname, &path); if (!error) { struct statfs64 tmp; - error = vfs_statfs64(path.dentry, &tmp); + error = vfs_statfs64(path.dentry, path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_put(&path); @@ -167,7 +191,7 @@ SYSCALL_DEFINE2(fstatfs, unsigned 
int, f file = fget(fd); if (!file) goto out; - error = vfs_statfs_native(file->f_path.dentry, &tmp); + error = vfs_statfs_native(file->f_path.dentry, file->f_path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -188,7 +212,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, file = fget(fd); if (!file) goto out; - error = vfs_statfs64(file->f_path.dentry, &tmp); + error = vfs_statfs64(file->f_path.dentry, file->f_path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -630,14 +654,20 @@ out: return err; } -SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) +static int do_fchmodat(int dfd, const char __user *filename, mode_t mode, int flag) { struct path path; struct inode *inode; int error; struct iattr newattrs; + int follow; - error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); + error = -EINVAL; + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + follow = (flag & AT_SYMLINK_NOFOLLOW) ? 
0 : LOOKUP_FOLLOW; + error = user_path_at(dfd, filename, follow, &path); if (error) goto out; inode = path.dentry->d_inode; @@ -659,9 +689,19 @@ out: return error; } +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) +{ + return do_fchmodat(dfd, filename, mode, 0); +} + SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode) { - return sys_fchmodat(AT_FDCWD, filename, mode); + return do_fchmodat(AT_FDCWD, filename, mode, 0); +} + +SYSCALL_DEFINE2(lchmod, const char __user *, filename, mode_t, mode) +{ + return do_fchmodat(AT_FDCWD, filename, mode, AT_SYMLINK_NOFOLLOW); } static int chown_common(struct dentry * dentry, uid_t user, gid_t group) @@ -707,6 +747,7 @@ out_release: out: return error; } +EXPORT_SYMBOL_GPL(sys_chown); SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) @@ -948,6 +989,7 @@ struct file *nameidata_to_filp(struct na return filp; } +int odirect_enable = 0; /* * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an * error. 
@@ -972,6 +1014,9 @@ struct file *dentry_open(struct dentry * return ERR_PTR(-EINVAL); } + if (!capable(CAP_SYS_RAWIO) && !odirect_enable) + flags &= ~O_DIRECT; + error = -ENFILE; f = get_empty_filp(); if (f == NULL) { @@ -1062,6 +1107,7 @@ SYSCALL_DEFINE3(open, const char __user asmlinkage_protect(3, ret, filename, flags, mode); return ret; } +EXPORT_SYMBOL_GPL(sys_open); SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, int, mode) diff -urNp linux-2.6.32.48/fs/partitions/check.c linux-2.6.32.48-openvz/fs/partitions/check.c --- linux-2.6.32.48/fs/partitions/check.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/partitions/check.c 2011-11-21 17:40:45.000000000 -0500 @@ -20,6 +20,7 @@ #include #include #include +#include #include "check.h" @@ -132,6 +133,7 @@ char *disk_name(struct gendisk *hd, int return buf; } +EXPORT_SYMBOL(disk_name); const char *bdevname(struct block_device *bdev, char *buf) { @@ -483,14 +485,16 @@ void register_disk(struct gendisk *disk) if (device_add(ddev)) return; -#ifndef CONFIG_SYSFS_DEPRECATED - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; + + if (!sysfs_deprecated) { + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (err) { + device_del(ddev); + return; + } } -#endif + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); @@ -672,8 +676,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); disk->driverfs_dev = NULL; -#ifndef CONFIG_SYSFS_DEPRECATED - sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); -#endif + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); device_del(disk_to_dev(disk)); } diff -urNp linux-2.6.32.48/fs/pipe.c linux-2.6.32.48-openvz/fs/pipe.c --- linux-2.6.32.48/fs/pipe.c 
2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/pipe.c 2011-11-21 17:40:45.000000000 -0500 @@ -22,6 +22,8 @@ #include #include +#include + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -526,7 +528,7 @@ redo1: int error, atomic = 1; if (!page) { - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_HIGHUSER | __GFP_UBC); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; @@ -875,7 +877,7 @@ struct pipe_inode_info * alloc_pipe_info { struct pipe_inode_info *pipe; - pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_UBC); if (pipe) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; @@ -1090,6 +1092,7 @@ int do_pipe_flags(int *fd, int flags) free_write_pipe(fw); return error; } +EXPORT_SYMBOL_GPL(do_pipe_flags); /* * sys_pipe() is the normal C calling standard for creating diff -urNp linux-2.6.32.48/fs/proc/array.c linux-2.6.32.48-openvz/fs/proc/array.c --- linux-2.6.32.48/fs/proc/array.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/array.c 2011-11-21 17:40:45.000000000 -0500 @@ -83,6 +83,8 @@ #include #include +#include + #include #include #include "internal.h" @@ -154,6 +156,18 @@ static inline const char *get_task_state return *p; } +static int task_virtual_pid(struct task_struct *t) +{ + struct pid *pid; + + pid = task_pid(t); + /* + * this will give wrong result for tasks, + * that failed to enter VE, but that's OK + */ + return pid ? pid->numbers[pid->level].nr : 0; +} + static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { @@ -161,7 +175,7 @@ static inline void task_state(struct seq int g; struct fdtable *fdt = NULL; const struct cred *cred; - pid_t ppid, tpid; + pid_t ppid, tpid, vpid; rcu_read_lock(); ppid = pid_alive(p) ? 
@@ -172,6 +186,7 @@ static inline void task_state(struct seq if (tracer) tpid = task_pid_nr_ns(tracer, ns); } + vpid = task_virtual_pid(p); cred = get_task_cred(p); seq_printf(m, "State:\t%s\n" @@ -205,6 +220,11 @@ static inline void task_state(struct seq put_cred(cred); seq_printf(m, "\n"); + + seq_printf(m, "envID:\t%d\nVPid:\t%d\n", + p->ve_task_info.owner_env->veid, vpid); + seq_printf(m, "PNState:\t%u\nStopState:\t%u\n", + p->pn_state, p->stopped_state); } static void render_sigset_t(struct seq_file *m, const char *header, @@ -244,10 +264,10 @@ static void collect_sigign_sigcatch(stru } } -static inline void task_sig(struct seq_file *m, struct task_struct *p) +void task_sig(struct seq_file *m, struct task_struct *p) { unsigned long flags; - sigset_t pending, shpending, blocked, ignored, caught; + sigset_t pending, shpending, blocked, ignored, caught, saved; int num_threads = 0; unsigned long qsize = 0; unsigned long qlim = 0; @@ -257,11 +277,13 @@ static inline void task_sig(struct seq_f sigemptyset(&blocked); sigemptyset(&ignored); sigemptyset(&caught); + sigemptyset(&saved); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; blocked = p->blocked; + saved = p->saved_sigmask; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); qsize = atomic_read(&__task_cred(p)->user->sigpending); @@ -278,6 +300,7 @@ static inline void task_sig(struct seq_f render_sigset_t(m, "SigBlk:\t", &blocked); render_sigset_t(m, "SigIgn:\t", &ignored); render_sigset_t(m, "SigCgt:\t", &caught); + render_sigset_t(m, "SigSvd:\t", &saved); } static void render_cap_t(struct seq_file *m, const char *header, @@ -312,6 +335,20 @@ static inline void task_cap(struct seq_f render_cap_t(m, "CapBnd:\t", &cap_bset); } +#ifdef CONFIG_BEANCOUNTERS +static inline void ub_dump_task_info(struct task_struct *tsk, + char *stsk, int ltsk, char *smm, int lmm) +{ + print_ub_uid(tsk->task_bc.task_ub, stsk, 
ltsk); + task_lock(tsk); + if (tsk->mm) + print_ub_uid(tsk->mm->mm_ub, smm, lmm); + else + strncpy(smm, "N/A", lmm); + task_unlock(tsk); +} +#endif + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -325,6 +362,9 @@ int proc_pid_status(struct seq_file *m, struct pid *pid, struct task_struct *task) { struct mm_struct *mm = get_task_mm(task); +#ifdef CONFIG_BEANCOUNTERS + char tsk_ub_info[64], mm_ub_info[64]; +#endif task_name(m, task); task_state(m, ns, pid, task); @@ -337,6 +377,14 @@ int proc_pid_status(struct seq_file *m, task_cap(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); +#ifdef CONFIG_BEANCOUNTERS + ub_dump_task_info(task, + tsk_ub_info, sizeof(tsk_ub_info), + mm_ub_info, sizeof(mm_ub_info)); + + seq_printf(m, "TaskUB:\t%s\n", tsk_ub_info); + seq_printf(m, "MMUB:\t%s\n", mm_ub_info); +#endif return 0; } @@ -360,6 +408,10 @@ static int do_task_stat(struct seq_file unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; +#ifdef CONFIG_BEANCOUNTERS + char ub_task_info[64]; + char ub_mm_info[64]; +#endif state = *get_task_state(task); vsize = eip = esp = 0; @@ -438,6 +490,7 @@ static int do_task_stat(struct seq_file priority = task_prio(task); nice = task_nice(task); +#ifndef CONFIG_VE /* Temporary variable needed for gcc-2.96 */ /* convert timespec -> nsec*/ start_time = @@ -445,10 +498,25 @@ static int do_task_stat(struct seq_file + task->real_start_time.tv_nsec; /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); +#else + start_time = ve_relative_clock(&task->start_time); +#endif + +#ifdef CONFIG_BEANCOUNTERS + ub_dump_task_info(task, ub_task_info, sizeof(ub_task_info), + ub_mm_info, sizeof(ub_mm_info)); +#endif seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d 
%u %u %llu %lu %ld" +#ifdef CONFIG_VE + " 0 0 0 0 0 0 0 %d %u" +#endif +#ifdef CONFIG_BEANCOUNTERS + " %s %s" +#endif + "\n", pid_nr_ns(pid, ns), tcomm, state, @@ -495,7 +563,16 @@ static int do_task_stat(struct seq_file task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime) +#ifdef CONFIG_VE + , task_pid_vnr(task), + VEID(VE_TASK_INFO(task)->owner_env) +#endif +#ifdef CONFIG_BEANCOUNTERS + , ub_task_info, + ub_mm_info +#endif + ); if (mm) mmput(mm); return 0; diff -urNp linux-2.6.32.48/fs/proc/base.c linux-2.6.32.48-openvz/fs/proc/base.c --- linux-2.6.32.48/fs/proc/base.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/base.c 2011-11-21 17:40:45.000000000 -0500 @@ -49,6 +49,7 @@ #include +#include #include #include #include @@ -156,10 +157,14 @@ static int get_fs_path(struct task_struc fs = task->fs; if (fs) { read_lock(&fs->lock); - *path = root ? fs->root : fs->pwd; - path_get(path); - read_unlock(&fs->lock); result = 0; + if (!root) + result = d_root_check(&fs->pwd); + if (result == 0) { + *path = root ? fs->root : fs->pwd; + path_get(path); + } + read_unlock(&fs->lock); } task_unlock(task); return result; @@ -577,17 +582,31 @@ static int proc_pid_syscall(struct task_ static int proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; - int allowed = 0; + int err; + /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. */ + err = -ENOENT; task = get_proc_task(inode); if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ); + if (ptrace_may_access(task, PTRACE_MODE_READ)) + err = 0; + else + /* + * This clever ptrace_may_attach() may play a trick + * on us. If the task is zombie it will consider this + * task to be not dumpable at all and will deny any + * ptracing in VE. 
Not a big deal for ptrace(), but + * following the link will fail with the -EACCES + * reason. Some software is unable to stand such a + * swindle and refuses to work :( + */ + err = (task->mm ? -EACCES : -ENOENT); put_task_struct(task); } - return allowed; + return err; } static int proc_setattr(struct dentry *dentry, struct iattr *attr) @@ -1066,6 +1085,8 @@ static ssize_t oom_adjust_write(struct f if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && oom_adjust != OOM_DISABLE) return -EINVAL; + if (oom_adjust == OOM_DISABLE && !ve_is_super(get_exec_env())) + return -EPERM; task = get_proc_task(file->f_path.dentry->d_inode); if (!task) @@ -1322,6 +1343,7 @@ void set_mm_exe_file(struct mm_struct *m mm->exe_file = new_exe_file; mm->num_exe_file_vmas = 0; } +EXPORT_SYMBOL(set_mm_exe_file); struct file *get_mm_exe_file(struct mm_struct *mm) { @@ -1360,10 +1382,15 @@ static int proc_exe_link(struct inode *i exe_file = get_mm_exe_file(mm); mmput(mm); if (exe_file) { - *exe_path = exe_file->f_path; - path_get(&exe_file->f_path); + int result; + + result = d_root_check(&exe_file->f_path); + if (result == 0) { + *exe_path = exe_file->f_path; + path_get(&exe_file->f_path); + } fput(exe_file); - return 0; + return result; } else return -ENOENT; } @@ -1371,13 +1398,14 @@ static int proc_exe_link(struct inode *i static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - int error = -EACCES; + int error; /* We don't need a base pointer in the /proc filesystem */ path_put(&nd->path); /* Are we allowed to snoop on the tasks file descriptors? 
*/ - if (!proc_fd_access_allowed(inode)) + error = proc_fd_access_allowed(inode); + if (error < 0) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); @@ -1412,12 +1440,13 @@ static int do_proc_readlink(struct path static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { - int error = -EACCES; + int error; struct inode *inode = dentry->d_inode; struct path path; /* Are we allowed to snoop on the tasks file descriptors? */ - if (!proc_fd_access_allowed(inode)) + error = proc_fd_access_allowed(inode); + if (error < 0) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &path); @@ -1668,6 +1697,7 @@ static int proc_fd_info(struct inode *in struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); + int err = -ENOENT; if (task) { files = get_files_struct(task); @@ -1680,7 +1710,8 @@ static int proc_fd_info(struct inode *in */ spin_lock(&files->file_lock); file = fcheck_files(files, fd); - if (file) { + err = -EACCES; + if (file && !d_root_check(&file->f_path)) { if (path) { *path = file->f_path; path_get(&file->f_path); @@ -1698,7 +1729,7 @@ static int proc_fd_info(struct inode *in spin_unlock(&files->file_lock); put_files_struct(files); } - return -ENOENT; + return err; } static int proc_fd_link(struct inode *inode, struct path *path) @@ -2488,7 +2519,7 @@ static int do_io_accounting(struct task_ struct task_struct *t = task; task_io_accounting_add(&acct, &task->signal->ioac); - while_each_thread(task, t) + while_each_thread_ve(task, t) task_io_accounting_add(&acct, &t->ioac); unlock_task_sighand(task, &flags); @@ -3200,3 +3231,35 @@ static const struct file_operations proc .read = generic_read_dir, .readdir = proc_task_readdir, }; + +/* Check whether dentry belongs to a task that already died */ +int proc_dentry_of_dead_task(struct dentry *dentry) +{ + if (dentry->d_inode->i_fop == &dummy_proc_pid_file_operations) + return 1; + + return (dentry->d_op == &pid_dentry_operations && + 
proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first == NULL); +} +EXPORT_SYMBOL(proc_dentry_of_dead_task); + +/* Stub handlers: read and write always fail with -ESRCH. Placed here to avoid using the vzrst module count. */ +static ssize_t dummy_proc_pid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + return -ESRCH; +} + +static ssize_t dummy_proc_pid_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + return -ESRCH; +} + +struct file_operations dummy_proc_pid_file_operations = { + .read = dummy_proc_pid_read, + .write = dummy_proc_pid_write, +}; + +EXPORT_SYMBOL(dummy_proc_pid_file_operations); + diff -urNp linux-2.6.32.48/fs/proc/cmdline.c linux-2.6.32.48-openvz/fs/proc/cmdline.c --- linux-2.6.32.48/fs/proc/cmdline.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/cmdline.c 2011-11-21 17:40:45.000000000 -0500 @@ -2,10 +2,12 @@ #include #include #include +#include static int cmdline_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%s\n", saved_command_line); + seq_printf(m, "%s\n", + ve_is_super(get_exec_env()) ? 
saved_command_line : "quiet"); return 0; } @@ -23,7 +25,7 @@ static const struct file_operations cmdl static int __init proc_cmdline_init(void) { - proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + proc_create("cmdline", 0, &glob_proc_root, &cmdline_proc_fops); return 0; } module_init(proc_cmdline_init); diff -urNp linux-2.6.32.48/fs/proc/cpuinfo.c linux-2.6.32.48-openvz/fs/proc/cpuinfo.c --- linux-2.6.32.48/fs/proc/cpuinfo.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/cpuinfo.c 2011-11-21 17:40:45.000000000 -0500 @@ -18,7 +18,7 @@ static const struct file_operations proc static int __init proc_cpuinfo_init(void) { - proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + proc_create("cpuinfo", 0, &glob_proc_root, &proc_cpuinfo_operations); return 0; } module_init(proc_cpuinfo_init); diff -urNp linux-2.6.32.48/fs/proc/devices.c linux-2.6.32.48-openvz/fs/proc/devices.c --- linux-2.6.32.48/fs/proc/devices.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/devices.c 2011-11-21 17:40:45.000000000 -0500 @@ -2,6 +2,7 @@ #include #include #include +#include static int devinfo_show(struct seq_file *f, void *v) { @@ -25,6 +26,9 @@ static int devinfo_show(struct seq_file static void *devinfo_start(struct seq_file *f, loff_t *pos) { + if (!ve_is_super(get_exec_env())) + return NULL; + if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) return pos; return NULL; @@ -64,7 +68,7 @@ static const struct file_operations proc static int __init proc_devices_init(void) { - proc_create("devices", 0, NULL, &proc_devinfo_operations); + proc_create("devices", 0, &glob_proc_root, &proc_devinfo_operations); return 0; } module_init(proc_devices_init); diff -urNp linux-2.6.32.48/fs/proc/generic.c linux-2.6.32.48-openvz/fs/proc/generic.c --- linux-2.6.32.48/fs/proc/generic.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/generic.c 2011-11-21 17:40:45.000000000 -0500 @@ -255,6 +255,10 @@ static int 
proc_notify_change(struct den struct proc_dir_entry *de = PDE(inode); int error; + if ((iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) && + LPDE(inode) == PDE(inode)) + return -EPERM; + error = inode_change_ok(inode, iattr); if (error) goto out; @@ -263,9 +267,12 @@ static int proc_notify_change(struct den if (error) goto out; - de->uid = inode->i_uid; - de->gid = inode->i_gid; - de->mode = inode->i_mode; + if (iattr->ia_valid & ATTR_UID) + de->uid = inode->i_uid; + if (iattr->ia_valid & ATTR_GID) + de->gid = inode->i_gid; + if (iattr->ia_valid & ATTR_MODE) + de->mode = inode->i_mode; out: return error; } @@ -274,11 +281,22 @@ static int proc_getattr(struct vfsmount struct kstat *stat) { struct inode *inode = dentry->d_inode; - struct proc_dir_entry *de = PROC_I(inode)->pde; - if (de && de->nlink) - inode->i_nlink = de->nlink; + struct proc_dir_entry *de = PDE(inode); + struct proc_dir_entry *lde = LPDE(inode); generic_fillattr(inode, stat); + + if (de && de->nlink) + stat->nlink = de->nlink; + /* if dentry is found in both trees and it is a directory + * then inode's nlink count must be altered, because local + * and global subtrees may differ. + * on the other hand, they may intersect, so actual nlink + * value is difficult to calculate - upper estimate is used + * instead of it. + */ + if (lde && lde != de && lde->nlink > 1) + stat->nlink += lde->nlink - 2; return 0; } @@ -411,28 +429,60 @@ static const struct dentry_operations pr .d_delete = proc_delete_dentry, }; +static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir, + const char *name, int namelen) +{ + struct proc_dir_entry *de; + + for (de = dir->subdir; de ; de = de->next) { + if (de->namelen != namelen) + continue; + if (memcmp(de->name, name, namelen)) + continue; + break; + } + return de; +} + /* * Don't create negative dentries here, return -ENOENT by hand * instead. 
*/ -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, - struct dentry *dentry) +struct dentry *proc_lookup_de(struct proc_dir_entry *de, + struct proc_dir_entry *lde, + struct inode *dir, struct dentry *dentry) { struct inode *inode = NULL; int error = -ENOENT; spin_lock(&proc_subdir_lock); - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + de = __proc_lookup(de, dentry->d_name.name, dentry->d_name.len); + if (lde != NULL) + lde = __proc_lookup(lde, dentry->d_name.name, + dentry->d_name.len); + + if (de == NULL) + de = lde; + + if (de != NULL) { + /* + * de lde meaning inode(g,l) + * ------------------------------------ + * NULL NULL -ENOENT * + * X NULL global X NULL + * NULL X local X X + * X Y both X Y + */ + { unsigned int ino; ino = de->low_ino; de_get(de); + if (lde != NULL) + de_get(lde); spin_unlock(&proc_subdir_lock); error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); + inode = proc_get_inode(dir->i_sb, ino, de, lde); goto out_unlock; } } @@ -446,13 +496,15 @@ out_unlock: } if (de) de_put(de); + if (lde) + de_put(lde); return ERR_PTR(error); } struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - return proc_lookup_de(PDE(dir), dir, dentry); + return proc_lookup_de(PDE(dir), LPDE(dir), dir, dentry); } /* @@ -464,13 +516,14 @@ struct dentry *proc_lookup(struct inode * value of the readdir() call, as long as it's non-negative * for success.. 
*/ -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir) +int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lde, + struct file *filp, void *dirent, filldir_t filldir) { unsigned int ino; int i; struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; + struct proc_dir_entry *ode = de, *fde = NULL; ino = inode->i_ino; i = filp->f_pos; @@ -491,25 +544,19 @@ int proc_readdir_de(struct proc_dir_entr /* fall through */ default: spin_lock(&proc_subdir_lock); - de = de->subdir; i -= 2; - for (;;) { - if (!de) { - ret = 1; - spin_unlock(&proc_subdir_lock); - goto out; - } - if (!i) - break; - de = de->next; - i--; - } - - do { +repeat: + de = de->subdir; + while (de != NULL) { struct proc_dir_entry *next; - /* filldir passes info to user space */ de_get(de); + if (i-- > 0 || (fde != NULL && + __proc_lookup(fde, + de->name, de->namelen))) + goto skip; + + /* filldir passes info to user space */ spin_unlock(&proc_subdir_lock); if (filldir(dirent, de->name, de->namelen, filp->f_pos, de->low_ino, de->mode >> 12) < 0) { @@ -518,10 +565,17 @@ int proc_readdir_de(struct proc_dir_entr } spin_lock(&proc_subdir_lock); filp->f_pos++; +skip: next = de->next; de_put(de); de = next; - } while (de); + } + + if (fde == NULL && lde != NULL && lde != ode) { + de = lde; + fde = ode; + goto repeat; + } spin_unlock(&proc_subdir_lock); } ret = 1; @@ -533,7 +587,7 @@ int proc_readdir(struct file *filp, void { struct inode *inode = filp->f_path.dentry->d_inode; - return proc_readdir_de(PDE(inode), filp, dirent, filldir); + return proc_readdir_de(PDE(inode), LPDE(inode), filp, dirent, filldir); } /* diff -urNp linux-2.6.32.48/fs/proc/inode.c linux-2.6.32.48-openvz/fs/proc/inode.c --- linux-2.6.32.48/fs/proc/inode.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/inode.c 2011-11-21 17:40:45.000000000 -0500 @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include @@ -442,7 
+444,7 @@ static const struct file_operations proc #endif struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, - struct proc_dir_entry *de) + struct proc_dir_entry *de, struct proc_dir_entry *lde) { struct inode * inode; @@ -453,6 +455,9 @@ struct inode *proc_get_inode(struct supe inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->fd = 0; PROC_I(inode)->pde = de; +#ifdef CONFIG_VE + PROC_I(inode)->lpde = lde; +#endif if (de->mode) { inode->i_mode = de->mode; @@ -494,9 +499,11 @@ int proc_fill_super(struct super_block * s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; - - de_get(&proc_root); - root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); + + de_get(get_exec_env()->proc_root); + de_get(&glob_proc_root); + root_inode = proc_get_inode(s, PROC_ROOT_INO, + &glob_proc_root, get_exec_env()->proc_root); if (!root_inode) goto out_no_root; root_inode->i_uid = 0; diff -urNp linux-2.6.32.48/fs/proc/internal.h linux-2.6.32.48-openvz/fs/proc/internal.h --- linux-2.6.32.48/fs/proc/internal.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/internal.h 2011-11-21 17:40:45.000000000 -0500 @@ -12,6 +12,12 @@ #include extern struct proc_dir_entry proc_root; +#ifdef CONFIG_VE +extern struct proc_dir_entry glob_proc_root; +#else +#define glob_proc_root proc_root +#endif + #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); #else @@ -80,10 +86,11 @@ static inline int proc_fd(struct inode * return PROC_I(inode)->fd; } -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, +struct dentry *proc_lookup_de(struct proc_dir_entry *de, + struct proc_dir_entry *lpde, struct inode *ino, struct dentry *dentry); -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir); +int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lpde, + struct file *filp, void *dirent, filldir_t filldir); struct pde_opener { 
struct inode *inode; @@ -106,7 +113,8 @@ void de_put(struct proc_dir_entry *de); extern struct vfsmount *proc_mnt; int proc_fill_super(struct super_block *); -struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); +struct inode *proc_get_inode(struct super_block *, unsigned int, + struct proc_dir_entry *, struct proc_dir_entry *); /* * These are generic /proc routines that use the internal diff -urNp linux-2.6.32.48/fs/proc/kmsg.c linux-2.6.32.48-openvz/fs/proc/kmsg.c --- linux-2.6.32.48/fs/proc/kmsg.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/kmsg.c 2011-11-21 17:40:45.000000000 -0500 @@ -12,6 +12,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -41,19 +45,20 @@ static ssize_t kmsg_read(struct file *fi static unsigned int kmsg_poll(struct file *file, poll_table *wait) { - poll_wait(file, &log_wait, wait); + poll_wait(file, &ve_log_wait, wait); if (do_syslog(9, NULL, 0)) return POLLIN | POLLRDNORM; return 0; } -static const struct file_operations proc_kmsg_operations = { +const struct file_operations proc_kmsg_operations = { .read = kmsg_read, .poll = kmsg_poll, .open = kmsg_open, .release = kmsg_release, }; +EXPORT_SYMBOL_GPL(proc_kmsg_operations); static int __init proc_kmsg_init(void) { diff -urNp linux-2.6.32.48/fs/proc/loadavg.c linux-2.6.32.48-openvz/fs/proc/loadavg.c --- linux-2.6.32.48/fs/proc/loadavg.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/loadavg.c 2011-11-21 17:40:45.000000000 -0500 @@ -13,14 +13,25 @@ static int loadavg_proc_show(struct seq_file *m, void *v) { unsigned long avnrun[3]; + long running, threads; + struct ve_struct *ve; - get_avenrun(avnrun, FIXED_1/200, 0); + ve = get_exec_env(); + if (ve_is_super(ve)) { + get_avenrun(avnrun, FIXED_1/200, 0); + running = nr_running(); + threads = nr_threads; + } else { + get_avenrun_ve(ve, avnrun, FIXED_1/200, 0); + running = nr_running_ve(ve); + threads = 
atomic_read(&ve->pcounter); + } - seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%ld %d\n", LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), - nr_running(), nr_threads, + running, threads, task_active_pid_ns(current)->last_pid); return 0; } @@ -39,7 +50,7 @@ static const struct file_operations load static int __init proc_loadavg_init(void) { - proc_create("loadavg", 0, NULL, &loadavg_proc_fops); + proc_create("loadavg", 0, &glob_proc_root, &loadavg_proc_fops); return 0; } module_init(proc_loadavg_init); diff -urNp linux-2.6.32.48/fs/proc/meminfo.c linux-2.6.32.48-openvz/fs/proc/meminfo.c --- linux-2.6.32.48/fs/proc/meminfo.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/meminfo.c 2011-11-21 17:40:45.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,9 +20,28 @@ void __attribute__((weak)) arch_report_m { } +#define K(x) ((x) << (PAGE_SHIFT - 10)) + +static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi) +{ + seq_printf(m, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + K(mi->si.totalram), + K(mi->si.freeram), + K(mi->si.totalswap), + K(mi->si.freeswap)); + + return 0; +} + static int meminfo_proc_show(struct seq_file *m, void *v) { + int ret; struct sysinfo i; + struct meminfo mi; unsigned long committed; unsigned long allowed; struct vmalloc_info vmi; @@ -29,12 +49,19 @@ static int meminfo_proc_show(struct seq_ unsigned long pages[NR_LRU_LISTS]; int lru; + si_meminfo(&i); + si_swapinfo(&i); + mi.si = i; + + ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi); + if (ret & NOTIFY_FAIL) + return 0; + if (ret & NOTIFY_OK) + return meminfo_proc_show_mi(m, &mi); + /* * display in kilobytes. 
*/ -#define K(x) ((x) << (PAGE_SHIFT - 10)) - si_meminfo(&i); - si_swapinfo(&i); committed = percpu_counter_read_positive(&vm_committed_as); allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; @@ -175,7 +202,7 @@ static const struct file_operations memi static int __init proc_meminfo_init(void) { - proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + proc_create("meminfo", 0, &glob_proc_root, &meminfo_proc_fops); return 0; } module_init(proc_meminfo_init); diff -urNp linux-2.6.32.48/fs/proc/proc_net.c linux-2.6.32.48-openvz/fs/proc/proc_net.c --- linux-2.6.32.48/fs/proc/proc_net.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/proc_net.c 2011-11-21 17:40:45.000000000 -0500 @@ -126,7 +126,7 @@ static struct dentry *proc_tgid_net_look de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { - de = proc_lookup_de(net->proc_net, dir, dentry); + de = proc_lookup_de(net->proc_net, NULL, dir, dentry); put_net(net); } return de; @@ -164,7 +164,8 @@ static int proc_tgid_net_readdir(struct ret = -EINVAL; net = get_proc_task_net(filp->f_path.dentry->d_inode); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); + ret = proc_readdir_de(net->proc_net, NULL, + filp, dirent, filldir); put_net(net); } return ret; @@ -234,7 +235,7 @@ static struct pernet_operations __net_in int __init proc_net_init(void) { - proc_symlink("net", NULL, "self/net"); + proc_symlink("net", &glob_proc_root, "self/net"); return register_pernet_subsys(&proc_net_ns_ops); } diff -urNp linux-2.6.32.48/fs/proc/proc_sysctl.c linux-2.6.32.48-openvz/fs/proc/proc_sysctl.c --- linux-2.6.32.48/fs/proc/proc_sysctl.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/proc_sysctl.c 2011-11-21 17:40:45.000000000 -0500 @@ -406,7 +406,7 @@ int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; - proc_sys_root = proc_mkdir("sys", NULL); + proc_sys_root = 
proc_mkdir("sys", &glob_proc_root); proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_fops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; diff -urNp linux-2.6.32.48/fs/proc/proc_tty.c linux-2.6.32.48-openvz/fs/proc/proc_tty.c --- linux-2.6.32.48/fs/proc/proc_tty.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/proc_tty.c 2011-11-21 17:40:45.000000000 -0500 @@ -13,6 +13,7 @@ #include #include #include +#include #include /* @@ -70,6 +71,9 @@ static int show_tty_driver(struct seq_fi dev_t from = MKDEV(p->major, p->minor_start); dev_t to = from + p->num; + if (!ve_accessible_strict(p->owner_env, get_exec_env())) + goto out; + if (&p->tty_drivers == tty_drivers.next) { /* pseudo-drivers first */ seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); @@ -97,6 +101,7 @@ static int show_tty_driver(struct seq_fi } if (from != to) show_tty_range(m, p, from, to - from); +out: return 0; } diff -urNp linux-2.6.32.48/fs/proc/root.c linux-2.6.32.48-openvz/fs/proc/root.c --- linux-2.6.32.48/fs/proc/root.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/root.c 2011-11-21 17:40:45.000000000 -0500 @@ -42,6 +42,9 @@ static int proc_get_sb(struct file_syste struct super_block *sb; struct pid_namespace *ns; struct proc_inode *ei; +#ifdef CONFIG_VE + struct vfsmount *proc_mnt = fs_type->owner_env->proc_mnt; +#endif if (proc_mnt) { /* Seed the root directory with a pid so it doesn't need @@ -95,11 +98,12 @@ static void proc_kill_sb(struct super_bl put_pid_ns(ns); } -static struct file_system_type proc_fs_type = { +struct file_system_type proc_fs_type = { .name = "proc", .get_sb = proc_get_sb, .kill_sb = proc_kill_sb, }; +EXPORT_SYMBOL(proc_fs_type); void __init proc_root_init(void) { @@ -109,6 +113,11 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; + +#ifdef CONFIG_VE + get_ve0()->proc_root = &proc_root; +#endif + proc_mnt = kern_mount_data(&proc_fs_type, 
&init_pid_ns); err = PTR_ERR(proc_mnt); if (IS_ERR(proc_mnt)) { @@ -116,16 +125,21 @@ void __init proc_root_init(void) return; } - proc_symlink("mounts", NULL, "self/mounts"); + proc_symlink("mounts", &glob_proc_root, "self/mounts"); +#ifdef CONFIG_VE + get_ve0()->proc_mnt = proc_mnt; +#endif proc_net_init(); #ifdef CONFIG_SYSVIPC - proc_mkdir("sysvipc", NULL); + proc_mkdir("sysvipc", &glob_proc_root); #endif - proc_mkdir("fs", NULL); + proc_mkdir("fs", &glob_proc_root); + proc_mkdir("fs", NULL); /* care about proc_mkdir("fs/xxx", NULL); */ + proc_mkdir("driver", NULL); - proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ + proc_mkdir("fs/nfsd", &glob_proc_root); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ proc_mkdir("openprom", NULL); @@ -141,8 +155,19 @@ void __init proc_root_init(void) static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat ) { + struct ve_struct *ve = get_exec_env(); + generic_fillattr(dentry->d_inode, stat); - stat->nlink = proc_root.nlink + nr_processes(); + stat->nlink = glob_proc_root.nlink; + if (ve_is_super(ve)) + stat->nlink += nr_processes(); +#ifdef CONFIG_VE + else + /* thread count. 
not really processes count */ + stat->nlink += atomic_read(&ve->pcounter); + /* the same logic as in the proc_getattr */ + stat->nlink += ve->proc_root->nlink - 2; +#endif return 0; } @@ -205,6 +230,22 @@ struct proc_dir_entry proc_root = { .parent = &proc_root, }; +#ifdef CONFIG_VE +struct proc_dir_entry glob_proc_root = { + .low_ino = PROC_ROOT_INO, + .namelen = 5, + .name = "/proc", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, + .count = ATOMIC_INIT(1), + .proc_iops = &proc_root_inode_operations, + .proc_fops = &proc_root_operations, + .parent = &glob_proc_root, +}; + +EXPORT_SYMBOL(glob_proc_root); +#endif + int pid_ns_prepare_proc(struct pid_namespace *ns) { struct vfsmount *mnt; diff -urNp linux-2.6.32.48/fs/proc/stat.c linux-2.6.32.48-openvz/fs/proc/stat.c --- linux-2.6.32.48/fs/proc/stat.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/stat.c 2011-11-21 17:40:45.000000000 -0500 @@ -22,6 +22,62 @@ #define arch_idle_time(cpu) 0 #endif +static int show_stat_ve(struct seq_file *p, struct ve_struct *ve, unsigned long jif) +{ + int i; + u64 user, nice, system; + cycles_t idle, iowait; + cpumask_t ve_cpus; + + ve_cpu_online_map(ve, &ve_cpus); + + user = nice = system = idle = iowait = 0; + for_each_cpu_mask(i, ve_cpus) { + user += VE_CPU_STATS(ve, i)->user; + nice += VE_CPU_STATS(ve, i)->nice; + system += VE_CPU_STATS(ve, i)->system; + idle += ve_sched_get_idle_time(ve, i); + iowait += ve_sched_get_iowait_time(ve, i); + } + + seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n", + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cycles_to_clocks(idle), + (unsigned long long)cycles_to_clocks(iowait)); + + for_each_cpu_mask(i, ve_cpus) { + user = VE_CPU_STATS(ve, i)->user; + nice = VE_CPU_STATS(ve, i)->nice; + system = VE_CPU_STATS(ve, i)->system; + idle = ve_sched_get_idle_time(ve, i); + iowait = 
ve_sched_get_iowait_time(ve, i); + seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n", + i, + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cycles_to_clocks(idle), + (unsigned long long)cycles_to_clocks(iowait)); + } + seq_printf(p, "intr 0\nswap 0 0\n"); + + seq_printf(p, + "\nctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + nr_context_switches(), + (unsigned long)jif + ve->start_timespec.tv_sec, + total_forks, + nr_running_ve(ve), + nr_iowait_ve(ve)); + + return 0; +} + static int show_stat(struct seq_file *p, void *v) { int i, j; @@ -33,12 +89,18 @@ static int show_stat(struct seq_file *p, unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; unsigned int per_irq_sum; + struct ve_struct *ve; + + getboottime(&boottime); + jif = boottime.tv_sec; + + ve = get_exec_env(); + if (!ve_is_super(ve)) + return show_stat_ve(p, ve, jif); user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; guest = cputime64_zero; - getboottime(&boottime); - jif = boottime.tv_sec; for_each_possible_cpu(i) { user = cputime64_add(user, kstat_cpu(i).cpustat.user); @@ -166,7 +228,7 @@ static const struct file_operations proc static int __init proc_stat_init(void) { - proc_create("stat", 0, NULL, &proc_stat_operations); + proc_create("stat", 0, &glob_proc_root, &proc_stat_operations); return 0; } module_init(proc_stat_init); diff -urNp linux-2.6.32.48/fs/proc/uptime.c linux-2.6.32.48-openvz/fs/proc/uptime.c --- linux-2.6.32.48/fs/proc/uptime.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/uptime.c 2011-11-21 17:40:45.000000000 -0500 @@ -19,6 +19,13 @@ static int uptime_proc_show(struct seq_f do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + 
set_normalized_timespec(&uptime, + uptime.tv_sec - get_exec_env()->start_timespec.tv_sec, + uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec); + } +#endif cputime_to_timespec(idletime, &idle); seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, @@ -42,7 +49,7 @@ static const struct file_operations upti static int __init proc_uptime_init(void) { - proc_create("uptime", 0, NULL, &uptime_proc_fops); + proc_create("uptime", 0, &glob_proc_root, &uptime_proc_fops); return 0; } module_init(proc_uptime_init); diff -urNp linux-2.6.32.48/fs/proc/version.c linux-2.6.32.48-openvz/fs/proc/version.c --- linux-2.6.32.48/fs/proc/version.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/proc/version.c 2011-11-21 17:40:45.000000000 -0500 @@ -28,7 +28,7 @@ static const struct file_operations vers static int __init proc_version_init(void) { - proc_create("version", 0, NULL, &version_proc_fops); + proc_create("version", 0, &glob_proc_root, &version_proc_fops); return 0; } module_init(proc_version_init); diff -urNp linux-2.6.32.48/fs/quota/dquot.c linux-2.6.32.48-openvz/fs/quota/dquot.c --- linux-2.6.32.48/fs/quota/dquot.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/quota/dquot.c 2011-11-21 17:40:45.000000000 -0500 @@ -170,8 +170,9 @@ static struct quota_format_type *find_qu struct quota_format_type *actqf; spin_lock(&dq_list_lock); - for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; - actqf = actqf->qf_next) + for (actqf = quota_formats; + actqf && (actqf->qf_fmt_id != id || actqf->qf_ops == NULL); + actqf = actqf->qf_next) ; if (!actqf || !try_module_get(actqf->qf_owner)) { int qm; diff -urNp linux-2.6.32.48/fs/quota/Kconfig linux-2.6.32.48-openvz/fs/quota/Kconfig --- linux-2.6.32.48/fs/quota/Kconfig 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/quota/Kconfig 2011-11-21 17:40:45.000000000 -0500 @@ -26,13 +26,22 @@ config QUOTA_NETLINK_INTERFACE config PRINT_QUOTA_WARNING bool "Print quota 
warnings to console (OBSOLETE)" depends on QUOTA - default y + default n help If you say Y here, quota warnings (about exceeding softlimit, reaching hardlimit, etc.) will be printed to the process' controlling terminal. Note that this behavior is currently deprecated and may go away in future. Please use notification via netlink socket instead. +config QUOTA_COMPAT + bool "Compatibility with older quotactl interface" + depends on QUOTA + help + This option enables compatibility layer for older version + of quotactl interface with byte granularity (QUOTAON at 0x0100, + GETQUOTA at 0x0D00). Interface versions older than that one and + with block granularity are still not supported. + # Generic support for tree structured quota files. Selected when needed. config QUOTA_TREE tristate @@ -53,6 +62,31 @@ config QFMT_V2 This quota format allows using quotas with 32-bit UIDs/GIDs. If you need this functionality say Y here. +config VZ_QUOTA + tristate "Virtuozzo Disk Quota support" + select QUOTA + select QUOTA_COMPAT + select VZ_DEV + default m + help + Virtuozzo Disk Quota imposes disk quota on directories with their + files and subdirectories in total. Such disk quota is used to + account and limit disk usage by Virtuozzo VPS, but also may be used + separately. + +config VZ_QUOTA_UNLOAD + bool "Unloadable Virtuozzo Disk Quota module" + depends on VZ_QUOTA=m + default n + help + Make Virtuozzo Disk Quota module unloadable. + Doesn't work reliably now. 
+ +config VZ_QUOTA_UGID + bool "Per-user and per-group quota in Virtuozzo quota partitions" + depends on VZ_QUOTA!=n + default y + config QUOTACTL bool depends on XFS_QUOTA || QUOTA diff -urNp linux-2.6.32.48/fs/quota/Makefile linux-2.6.32.48-openvz/fs/quota/Makefile --- linux-2.6.32.48/fs/quota/Makefile 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/quota/Makefile 2011-11-21 17:40:45.000000000 -0500 @@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o + +obj-y += vzdquota/ diff -urNp linux-2.6.32.48/fs/quota/quota.c linux-2.6.32.48-openvz/fs/quota/quota.c --- linux-2.6.32.48/fs/quota/quota.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/quota/quota.c 2011-11-21 17:40:45.000000000 -0500 @@ -18,6 +18,7 @@ #include #include #include +#include /* Check validity of generic quotactl commands */ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, @@ -83,11 +84,11 @@ static int generic_quotactl_valid(struct if (cmd == Q_GETQUOTA) { if (((type == USRQUOTA && current_euid() != id) || (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_VE_SYS_ADMIN)) return -EPERM; } else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; return 0; @@ -135,10 +136,10 @@ static int xqm_quotactl_valid(struct sup if (cmd == Q_XGETQUOTA) { if (((type == XQM_USRQUOTA && current_euid() != id) || (type == XQM_GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_VE_SYS_ADMIN)) return -EPERM; } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; } @@ -164,7 +165,7 @@ void sync_quota_sb(struct super_block *s { int cnt; - if (!sb->s_qcop->quota_sync) + if (!sb->s_qcop || !sb->s_qcop->quota_sync) return; 
sb->s_qcop->quota_sync(sb, type); @@ -188,6 +189,8 @@ void sync_quota_sb(struct super_block *s continue; if (!sb_has_quota_active(sb, cnt)) continue; + if (!sb_dqopt(sb)->files[cnt]) + continue; mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); @@ -361,6 +364,7 @@ static struct super_block *quotactl_bloc struct block_device *bdev; struct super_block *sb; char *tmp = getname(special); + int error; if (IS_ERR(tmp)) return ERR_CAST(tmp); @@ -368,6 +372,13 @@ static struct super_block *quotactl_bloc putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); + + error = devcgroup_inode_permission(bdev->bd_inode, MAY_QUOTACTL); + if (error) { + bdput(bdev); + return ERR_PTR(error); + } + sb = get_super(bdev); bdput(bdev); if (!sb) @@ -379,6 +390,231 @@ static struct super_block *quotactl_bloc #endif } +#ifdef CONFIG_QUOTA_COMPAT + +#define QC_QUOTAON 0x0100 /* enable quotas */ +#define QC_QUOTAOFF 0x0200 /* disable quotas */ +/* GETQUOTA, SETQUOTA and SETUSE which were at 0x0300-0x0500 has now other parameteres */ +#define QC_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ +#define QC_SETQLIM 0x0700 /* set limits */ +/* GETSTATS at 0x0800 is now longer... */ +#define QC_GETINFO 0x0900 /* get info about quotas - graces, flags... 
*/ +#define QC_SETINFO 0x0A00 /* set info about quotas */ +#define QC_SETGRACE 0x0B00 /* set inode and block grace */ +#define QC_SETFLAGS 0x0C00 /* set flags for quota */ +#define QC_GETQUOTA 0x0D00 /* get limits and usage */ +#define QC_SETQUOTA 0x0E00 /* set limits and usage */ +#define QC_SETUSE 0x0F00 /* set usage */ +/* 0x1000 used by old RSQUASH */ +#define QC_GETSTATS 0x1100 /* get collected stats */ + +struct compat_dqblk { + unsigned int dqb_ihardlimit; + unsigned int dqb_isoftlimit; + unsigned int dqb_curinodes; + unsigned int dqb_bhardlimit; + unsigned int dqb_bsoftlimit; + qsize_t dqb_curspace; + __kernel_time_t dqb_btime; + __kernel_time_t dqb_itime; +}; + +#ifdef CONFIG_COMPAT + +struct compat_compat_dqblk { + compat_uint_t dqb_ihardlimit; + compat_uint_t dqb_isoftlimit; + compat_uint_t dqb_curinodes; + compat_uint_t dqb_bhardlimit; + compat_uint_t dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_time_t dqb_btime; + compat_time_t dqb_itime; +}; + +#endif + +struct compat_dqinfo { + unsigned int dqi_bgrace; + unsigned int dqi_igrace; + unsigned int dqi_flags; + unsigned int dqi_blocks; + unsigned int dqi_free_blk; + unsigned int dqi_free_entry; +}; + +struct compat_dqstats { + __u32 lookups; + __u32 drops; + __u32 reads; + __u32 writes; + __u32 cache_hits; + __u32 allocated_dquots; + __u32 free_dquots; + __u32 syncs; + __u32 version; +}; + +asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); + +static long compat_quotactl(unsigned int cmds, unsigned int type, + const char __user *special, qid_t id, + void __user *addr) +{ + struct super_block *sb; + long ret; + + sb = NULL; + switch (cmds) { + case QC_QUOTAON: + return sys_quotactl(QCMD(Q_QUOTAON, type), + special, id, addr); + + case QC_QUOTAOFF: + return sys_quotactl(QCMD(Q_QUOTAOFF, type), + special, id, addr); + + case QC_SYNC: + return sys_quotactl(QCMD(Q_SYNC, type), + special, id, addr); + + case QC_GETQUOTA: { + struct if_dqblk idq; + 
struct compat_dqblk cdq; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); + if (ret) + break; + ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); + if (ret) + break; + memset(&cdq, 0, sizeof(cdq)); /* cdq is copied out whole: padding must not leak kernel stack */ + cdq.dqb_ihardlimit = idq.dqb_ihardlimit; + cdq.dqb_isoftlimit = idq.dqb_isoftlimit; + cdq.dqb_curinodes = idq.dqb_curinodes; + cdq.dqb_bhardlimit = idq.dqb_bhardlimit; + cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit; + cdq.dqb_curspace = idq.dqb_curspace; + cdq.dqb_btime = idq.dqb_btime; + cdq.dqb_itime = idq.dqb_itime; + if (copy_to_user(addr, &cdq, sizeof(cdq))) /* ret is still 0 from get_dqblk unless this fails */ + ret = -EFAULT; + break; + } + + case QC_SETQUOTA: + case QC_SETUSE: + case QC_SETQLIM: { + struct if_dqblk idq; + struct compat_dqblk cdq; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id); + if (ret) + break; + ret = -EFAULT; + if (copy_from_user(&cdq, addr, sizeof(cdq))) + break; + idq.dqb_ihardlimit = cdq.dqb_ihardlimit; + idq.dqb_isoftlimit = cdq.dqb_isoftlimit; + idq.dqb_curinodes = cdq.dqb_curinodes; + idq.dqb_bhardlimit = cdq.dqb_bhardlimit; + idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit; + idq.dqb_curspace = cdq.dqb_curspace; + idq.dqb_valid = 0; + if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM) + idq.dqb_valid |= QIF_LIMITS; + if (cmds == QC_SETQUOTA || cmds == QC_SETUSE) + idq.dqb_valid |= QIF_USAGE; + ret = sb->s_qcop->set_dqblk(sb, type, id, &idq); + break; + } + + case QC_GETINFO: { + struct if_dqinfo iinf; + struct compat_dqinfo cinf; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); + if (ret) + break; + ret = sb->s_qcop->get_info(sb, type, &iinf); + if (ret) + break; + cinf.dqi_bgrace = iinf.dqi_bgrace; + cinf.dqi_igrace = iinf.dqi_igrace; + cinf.dqi_flags = 0; + if (iinf.dqi_flags & DQF_INFO_DIRTY) + cinf.dqi_flags |= 0x0010; + cinf.dqi_blocks = 0; + 
cinf.dqi_free_blk = 0; + cinf.dqi_free_entry = 0; + ret = 0; + if (copy_to_user(addr, &cinf, sizeof(cinf))) + ret = -EFAULT; + break; + } + + case QC_SETINFO: + case QC_SETGRACE: + case QC_SETFLAGS: { + struct if_dqinfo iinf; + struct compat_dqinfo cinf; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_SETINFO, id); + if (ret) + break; + ret = -EFAULT; + if (copy_from_user(&cinf, addr, sizeof(cinf))) + break; + iinf.dqi_bgrace = cinf.dqi_bgrace; + iinf.dqi_igrace = cinf.dqi_igrace; + iinf.dqi_flags = cinf.dqi_flags; + iinf.dqi_valid = 0; + if (cmds == QC_SETINFO || cmds == QC_SETGRACE) + iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE; + if (cmds == QC_SETINFO || cmds == QC_SETFLAGS) + iinf.dqi_valid |= IIF_FLAGS; + ret = sb->s_qcop->set_info(sb, type, &iinf); + break; + } + + case QC_GETSTATS: { + struct compat_dqstats stat; + + memset(&stat, 0, sizeof(stat)); + stat.version = 6*10000+5*100+0; + ret = 0; + if (copy_to_user(addr, &stat, sizeof(stat))) + ret = -EFAULT; + break; + } + + default: + ret = -ENOSYS; + break; + } + if (sb && !IS_ERR(sb)) + drop_super(sb); + return ret; +} + +#endif + /* * This is the system call interface. This communicates with * the user-level programs. 
Currently this only supports diskquota @@ -395,6 +631,11 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; +#ifdef CONFIG_QUOTA_COMPAT + if (cmds >= 0x0100 && cmds < 0x3000) + return compat_quotactl(cmds, type, special, id, addr); +#endif + if (cmds != Q_SYNC || special) { sb = quotactl_block(special); if (IS_ERR(sb)) @@ -459,6 +700,11 @@ asmlinkage long sys32_quotactl(unsigned compat_uint_t data; u16 xdata; long ret; +#ifdef CONFIG_QUOTA_COMPAT + struct compat_dqblk __user *cdq; + struct compat_compat_dqblk __user *compat_cdq; + compat_time_t time; +#endif cmds = cmd >> SUBCMDSHIFT; @@ -519,6 +765,43 @@ asmlinkage long sys32_quotactl(unsigned break; ret = 0; break; +#ifdef CONFIG_QUOTA_COMPAT + case QC_GETQUOTA: + cdq = compat_alloc_user_space(sizeof(struct compat_dqblk)); + compat_cdq = addr; + ret = sys_quotactl(cmd, special, id, cdq); + if (ret) + break; + ret = -EFAULT; + if (copy_in_user(compat_cdq, cdq, /* all five leading 32-bit fields, up to dqb_curspace */ + offsetof(struct compat_compat_dqblk, dqb_curspace)) || + copy_in_user(&compat_cdq->dqb_curspace, &cdq->dqb_curspace, + sizeof(cdq->dqb_curspace)) || + get_user(time, &cdq->dqb_btime) || + put_user(time, &compat_cdq->dqb_btime) || + get_user(time, &cdq->dqb_itime) || + put_user(time, &compat_cdq->dqb_itime)) + break; + ret = 0; + break; + case QC_SETQUOTA: + case QC_SETUSE: + case QC_SETQLIM: + cdq = compat_alloc_user_space(sizeof(struct compat_dqblk)); + compat_cdq = addr; + ret = -EFAULT; + if (copy_in_user(cdq, compat_cdq, /* all five leading 32-bit fields, up to dqb_curspace */ + offsetof(struct compat_compat_dqblk, dqb_curspace)) || + copy_in_user(&cdq->dqb_curspace, &compat_cdq->dqb_curspace, + sizeof(cdq->dqb_curspace)) || + get_user(time, &compat_cdq->dqb_btime) || + put_user(time, &cdq->dqb_btime) || + get_user(time, &compat_cdq->dqb_itime) || + put_user(time, &cdq->dqb_itime)) + break; + ret = sys_quotactl(cmd, special, id, cdq); + break; +#endif default: ret = sys_quotactl(cmd, special, 
id, addr); } diff -urNp linux-2.6.32.48/fs/quota/vzdquota/Makefile linux-2.6.32.48-openvz/fs/quota/vzdquota/Makefile --- linux-2.6.32.48/fs/quota/vzdquota/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/quota/vzdquota/Makefile 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,4 @@ +obj-$(CONFIG_VZ_QUOTA) += vzdquota.o +vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o +vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o +vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o diff -urNp linux-2.6.32.48/fs/quota/vzdquota/vzdq_file.c linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_file.c --- linux-2.6.32.48/fs/quota/vzdquota/vzdq_file.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_file.c 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,956 @@ +/* + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains Virtuozzo quota files as proc entry implementation. + * It is required for std quota tools to work correctly as they are expecting + * aquota.user and aquota.group files. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../quotaio_v2.h" +#include "../quota_tree.h" +#include + +#include +#include +#include +#include +#include + +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) + +/* ---------------------------------------------------------------------- + * + * File read operation + * + * FIXME: functions in this section (as well as many functions in vzdq_ugid.c, + * perhaps) abuse vz_quota_mutex. + * Taking a global mutex for lengthy and user-controlled operations inside + * VPSs is not a good idea in general. 
+ * In this case, the reasons for taking this mutex are completely unclear, + * especially taking into account that the only function that has comments + * about the necessity to be called under this mutex + * (create_proc_quotafile) is actually called OUTSIDE it. + * + * --------------------------------------------------------------------- */ + +#define DQBLOCK_SIZE 1024 +#define DQUOTBLKNUM 21U +#define DQTREE_DEPTH 4 +#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1) +#define ISINDBLOCK(num) ((num)%2 != 0) +#define FIRST_DATABLK 2 /* first even number */ +#define LAST_IND_LEVEL (DQTREE_DEPTH - 1) +#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS)) +#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \ + & QUOTATREE_BMASK) + +#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH) +#error xBITS and DQTREE_DEPTH does not correspond +#endif + +#define BLOCK_NOT_FOUND 1 + +/* data for quota file -- one per proc entry */ +struct quotatree_data { + struct list_head list; + struct vz_quota_master *qmblk; + int type; /* type of the tree */ +}; + +/* serialized by vz_quota_mutex */ +static LIST_HEAD(qf_data_head); + +static const u_int32_t vzquota_magics[] = V2_INITQMAGICS; +static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS; +static const char aquota_user[] = "aquota.user"; +static const char aquota_group[] = "aquota.group"; + + +static inline loff_t get_depoff(int depth) +{ + loff_t res = 1; + while (depth) { + res += (1 << ((depth - 1)*QUOTAID_EBITS + 1)); + depth--; + } + return res; +} + +static inline loff_t get_blknum(loff_t num, int depth) +{ + loff_t res; + res = (num << 1) + get_depoff(depth); + return res; +} + +static int get_depth(loff_t num) +{ + int i; + for (i = 0; i < DQTREE_DEPTH; i++) { + if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1 + || num < get_depoff(i + 1))) + return i; + } + return -1; +} + +static inline loff_t get_offset(loff_t num) +{ + loff_t res, tmp; + + tmp = get_depth(num); + if 
(tmp < 0) + return -1; + num -= get_depoff(tmp); + BUG_ON(num < 0); + res = num >> 1; + + return res; +} + +static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level) +{ + /* return maximum available block num */ + return tree->levels[level].freenum; +} + +static inline loff_t get_block_num(struct quotatree_tree *tree) +{ + loff_t ind_blk_num, quot_blk_num, max_ind, max_quot; + + quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1); + max_quot = TREENUM_2_BLKNUM(quot_blk_num); + ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1)); + max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL) + : get_blknum(ind_blk_num, 0); + + return (max_ind > max_quot) ? max_ind + 1 : max_quot + 1; +} + +/* Write quota file header */ +static int read_header(void *buf, struct quotatree_tree *tree, + struct dq_info *dq_ugid_info, int type) +{ + struct v2_disk_dqheader *dqh; + struct v2_disk_dqinfo *dq_disk_info; + + dqh = buf; + dq_disk_info = buf + sizeof(struct v2_disk_dqheader); + + dqh->dqh_magic = vzquota_magics[type]; + dqh->dqh_version = vzquota_versions[type]; + + dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire; + dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire; + dq_disk_info->dqi_flags = 0; /* no flags */ + dq_disk_info->dqi_blocks = get_block_num(tree); + dq_disk_info->dqi_free_blk = 0; /* first block in the file */ + dq_disk_info->dqi_free_entry = FIRST_DATABLK; + + return 0; +} + +static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf) +{ + int i, j, lev_num; + + lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1; + for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) { + struct quotatree_node *next, *parent; + + parent = p; + next = p; + for (j = lev_num; j >= 0; j--) { + if (!next->blocks[GETLEVINDX(i,j)]) { + buf[i] = 0; + goto bad_branch; + } + parent = next; + next = next->blocks[GETLEVINDX(i,j)]; + } + buf[i] = (depth == DQTREE_DEPTH - 1) ? 
+ TREENUM_2_BLKNUM(parent->num) + : get_blknum(next->num, depth + 1); + + bad_branch: + ; + } + + return 0; +} + +/* + * Write index block to disk (or buffer) + * @buf has length 256*sizeof(u_int32_t) bytes + */ +static int read_index_block(int num, u_int32_t *buf, + struct quotatree_tree *tree) +{ + struct quotatree_node *p; + u_int32_t index; + loff_t off; + int depth, res; + + res = BLOCK_NOT_FOUND; + index = 0; + depth = get_depth(num); + off = get_offset(num); + if (depth < 0 || off < 0) + return -EINVAL; + + list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh, + list) { + if (p->num >= off) + res = 0; + if (p->num != off) + continue; + get_block_child(depth, p, buf); + break; + } + + return res; +} + +static inline void convert_quot_format(struct v2_disk_dqblk *dq, + struct vz_quota_ugid *vzq) +{ + dq->dqb_id = vzq->qugid_id; + dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit; + dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit; + dq->dqb_curinodes = vzq->qugid_stat.icurrent; + dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE; + dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE; + dq->dqb_curspace = vzq->qugid_stat.bcurrent; + dq->dqb_btime = vzq->qugid_stat.btime; + dq->dqb_itime = vzq->qugid_stat.itime; +} + +static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree) +{ + int res, i, entries = 0; + struct qt_disk_dqdbheader *dq_header; + struct quotatree_node *p; + struct v2_disk_dqblk *blk = buf + sizeof(struct qt_disk_dqdbheader); + + res = BLOCK_NOT_FOUND; + dq_header = buf; + memset(dq_header, 0, sizeof(*dq_header)); + + list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh), + list) { + if (TREENUM_2_BLKNUM(p->num) >= num) + res = 0; + if (TREENUM_2_BLKNUM(p->num) != num) + continue; + + for (i = 0; i < QUOTATREE_BSIZE; i++) { + if (!p->blocks[i]) + continue; + convert_quot_format(blk + entries, + (struct vz_quota_ugid *)p->blocks[i]); + entries++; + res = 0; + } + break; + } + 
dq_header->dqdh_entries = entries; + + return res; +} + +static int read_block(int num, void *buf, struct quotatree_tree *tree, + struct dq_info *dq_ugid_info, int magic) +{ + int res; + + memset(buf, 0, DQBLOCK_SIZE); + if (!num) + res = read_header(buf, tree, dq_ugid_info, magic); + else if (ISINDBLOCK(num)) + res = read_index_block(num, (u_int32_t*)buf, tree); + else + res = read_dquot(num, buf, tree); + + return res; +} + +/* + * FIXME: this function can handle quota files up to 2GB only. + */ +static int read_proc_quotafile(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + off_t blk_num, blk_off, buf_off; + char *tmp; + size_t buf_size; + struct quotatree_data *qtd; + struct quotatree_tree *tree; + struct dq_info *dqi; + int res; + + *start = NULL; + tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + qtd = data; + mutex_lock(&vz_quota_mutex); + mutex_lock(&qtd->qmblk->dq_mutex); + + res = 0; + tree = QUGID_TREE(qtd->qmblk, qtd->type); + if (!tree) { + *eof = 1; + goto out_dq; + } + + dqi = &qtd->qmblk->dq_ugid_info[qtd->type]; + + buf_off = 0; + buf_size = count; + blk_num = off / DQBLOCK_SIZE; + blk_off = off % DQBLOCK_SIZE; + + while (buf_size > 0) { + off_t len; + + len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size); + res = read_block(blk_num, tmp, tree, dqi, qtd->type); + if (res < 0) + goto out_err; + if (res == BLOCK_NOT_FOUND) { + *eof = 1; + break; + } + memcpy(page + buf_off, tmp + blk_off, len); + + blk_num++; + buf_size -= len; + blk_off = 0; + buf_off += len; + } + res = buf_off; + +out_err: + *start += count; +out_dq: + mutex_unlock(&qtd->qmblk->dq_mutex); + mutex_unlock(&vz_quota_mutex); + kfree(tmp); + + return res; +} + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota/QID/aquota.* files + * + * FIXME: this code lacks serialization of read/readdir/lseek. 
+ * However, this problem should be fixed after the mainstream issue of what + * appears to be non-atomic read and update of file position in sys_read. + * + * --------------------------------------------------------------------- */ + +static inline unsigned long vzdq_aquot_getino(dev_t dev) +{ + return 0xec000000UL + dev; +} + +static inline dev_t vzdq_aquot_getidev(struct inode *inode) +{ + return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link; +} + +static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev) +{ + PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev; +} + +static ssize_t vzdq_aquotf_read(struct file *file, + char __user *buf, size_t size, loff_t *ppos) +{ + char *page; + size_t bufsize; + ssize_t l, l2, copied; + char *start; + struct inode *inode; + struct block_device *bdev; + struct super_block *sb; + struct quotatree_data data; + int eof, err; + + err = -ENOMEM; + page = (char *)__get_free_page(GFP_KERNEL); + if (page == NULL) + goto out_err; + + err = -ENODEV; + inode = file->f_dentry->d_inode; + bdev = bdget(vzdq_aquot_getidev(inode)); + if (bdev == NULL) + goto out_err; + sb = get_super(bdev); + bdput(bdev); + if (sb == NULL) + goto out_err; + data.qmblk = vzquota_find_qmblk(sb); + data.type = PROC_I(inode)->fd - 1; + drop_super(sb); + if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD) + goto out_err; + + copied = 0; + l = l2 = 0; + while (1) { + bufsize = min(size, (size_t)PAGE_SIZE); + if (bufsize <= 0) + break; + + l = read_proc_quotafile(page, &start, *ppos, bufsize, + &eof, &data); + if (l <= 0) + break; + + l2 = copy_to_user(buf, page, l); + copied += l - l2; + if (l2) + break; + + buf += l; + size -= l; + *ppos += (unsigned long)start; + l = l2 = 0; + } + + qmblk_put(data.qmblk); + free_page((unsigned long)page); + if (copied) + return copied; + else if (l2) /* last copy_to_user failed */ + return -EFAULT; + else /* read error or EOF */ + return l; + +out_err: + if (page != NULL) + free_page((unsigned 
long)page); + return err; +} + +static struct file_operations vzdq_aquotf_file_operations = { + .read = &vzdq_aquotf_read, +}; + +static struct inode_operations vzdq_aquotf_inode_operations = { +}; + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota/QID directory + * + * --------------------------------------------------------------------- */ + +static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler) +{ + loff_t n; + int err; + + n = file->f_pos; + for (err = 0; !err; n++) { + /* ppc32 can't cmp 2 long long's in switch, calls __cmpdi2() */ + switch ((unsigned long)n) { + case 0: + err = (*filler)(data, ".", 1, n, + file->f_dentry->d_inode->i_ino, + DT_DIR); + break; + case 1: + err = (*filler)(data, "..", 2, n, + parent_ino(file->f_dentry), DT_DIR); + break; + case 2: + err = (*filler)(data, aquota_user, + sizeof(aquota_user)-1, n, + file->f_dentry->d_inode->i_ino + + USRQUOTA + 1, + DT_REG); + break; + case 3: + err = (*filler)(data, aquota_group, + sizeof(aquota_group)-1, n, + file->f_dentry->d_inode->i_ino + + GRPQUOTA + 1, + DT_REG); + break; + default: + goto out; + } + } +out: + file->f_pos = err ? n - 1 : n; /* the loop stepped past an entry that did not fit: retry it next time */ + return err; +} + +struct vzdq_aquotq_lookdata { + dev_t dev; + int type; + struct vz_quota_master *qmblk; +}; + +static int vzdq_aquotq_looktest(struct inode *inode, void *data) +{ + struct vzdq_aquotq_lookdata *d; + + d = data; + return inode->i_op == &vzdq_aquotf_inode_operations && + vzdq_aquot_getidev(inode) == d->dev && + PROC_I(inode)->fd == d->type + 1; +} + +static int vzdq_aquotq_lookset(struct inode *inode, void *data) +{ + struct vzdq_aquotq_lookdata *d; + struct super_block *sb; + struct quotatree_data qtd; + struct quotatree_tree *tree; + + d = data; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_nlink = 1; + 
inode->i_op = &vzdq_aquotf_inode_operations; + inode->i_fop = &vzdq_aquotf_file_operations; + PROC_I(inode)->fd = d->type + 1; + vzdq_aquot_setidev(inode, d->dev); + + /* Setting size */ + sb = user_get_super(d->dev); + if (sb == NULL) + return -ENODEV; + qtd.qmblk = vzquota_find_qmblk(sb); + drop_super(sb); + + if (qtd.qmblk == NULL) + return -ESRCH; + if (qtd.qmblk == VZ_QUOTA_BAD) + return -EIO; + qtd.type = PROC_I(inode)->fd - 1; + tree = QUGID_TREE(qtd.qmblk, qtd.type); + inode->i_size = tree ? get_block_num(tree) * 1024 : 0; /* no ugid tree: the file reads as empty */ + qmblk_put(qtd.qmblk); /* the master was needed for the size only: drop the reference taken above */ + return 0; +} + +static int vzdq_aquotq_revalidate(struct dentry *vdentry, struct nameidata *nd) +{ + return 0; +} + +static struct dentry_operations vzdq_aquotq_dentry_operations = { + .d_revalidate = &vzdq_aquotq_revalidate, +}; + +static struct vz_quota_master *find_qmblk_by_dev(dev_t dev) +{ + struct super_block *sb; + struct vz_quota_master *qmblk; + + qmblk = NULL; + sb = user_get_super(dev); + if (sb != NULL) { + qmblk = vzquota_find_qmblk(sb); + drop_super(sb); + + if (qmblk == VZ_QUOTA_BAD) + qmblk = NULL; + } + + return qmblk; +} + +static struct dentry *vzdq_aquotq_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + struct vzdq_aquotq_lookdata d; + int k; + + if (dentry->d_name.len == sizeof(aquota_user)-1) { + if (memcmp(dentry->d_name.name, aquota_user, + sizeof(aquota_user)-1)) + goto out; + k = USRQUOTA; + } else if (dentry->d_name.len == sizeof(aquota_group)-1) { + if (memcmp(dentry->d_name.name, aquota_group, + sizeof(aquota_group)-1)) + goto out; + k = GRPQUOTA; + } else + goto out; + d.dev = vzdq_aquot_getidev(dir); + d.type = k; + d.qmblk = find_qmblk_by_dev(d.dev); + if (d.qmblk == NULL) + goto out; + inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1, + vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d); + qmblk_put(d.qmblk); /* the reference only had to pin the master while the inode was set up */ + if (inode == NULL) + goto out; + if (inode->i_state & I_NEW) /* an inode found in the cache is already unlocked */ + unlock_new_inode(inode); + dentry->d_op = &vzdq_aquotq_dentry_operations; + d_add(dentry, inode); + return NULL; +out: + return 
ERR_PTR(-ENOENT); +} + +static struct file_operations vzdq_aquotq_file_operations = { + .read = &generic_read_dir, + .readdir = &vzdq_aquotq_readdir, +}; + +static struct inode_operations vzdq_aquotq_inode_operations = { + .lookup = &vzdq_aquotq_lookup, +}; + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota directory + * + * --------------------------------------------------------------------- */ + +struct vzdq_aquot_de { + struct list_head list; + struct vfsmount *mnt; +}; + +static int vzdq_aquot_buildmntlist(struct ve_struct *ve, + struct list_head *head) +{ + struct vfsmount *rmnt, *mnt; + struct vzdq_aquot_de *p; + int err; + +#ifdef CONFIG_VE + rmnt = mntget(ve->root_path.mnt); +#else + read_lock(&current->fs->lock); + rmnt = mntget(current->fs->rootmnt); + read_unlock(&current->fs->lock); +#endif + mnt = rmnt; + spin_lock(&vfsmount_lock); + while (1) { + list_for_each_entry(p, head, list) { + if (p->mnt->mnt_sb == mnt->mnt_sb) + goto skip; + } + + err = -ENOMEM; + p = kmalloc(sizeof(*p), GFP_ATOMIC); + if (p == NULL) + goto out; + p->mnt = mntget(mnt); + list_add_tail(&p->list, head); + +skip: + err = 0; + if (list_empty(&mnt->mnt_mounts)) { + while (1) { + if (mnt == rmnt) + goto out; + if (mnt->mnt_child.next != + &mnt->mnt_parent->mnt_mounts) + break; + mnt = mnt->mnt_parent; + } + mnt = list_entry(mnt->mnt_child.next, + struct vfsmount, mnt_child); + } else + mnt = list_entry(mnt->mnt_mounts.next, + struct vfsmount, mnt_child); + } +out: + spin_unlock(&vfsmount_lock); + mntput(rmnt); + return err; +} + +static void vzdq_aquot_releasemntlist(struct ve_struct *ve, + struct list_head *head) +{ + struct vzdq_aquot_de *p; + + while (!list_empty(head)) { + p = list_entry(head->next, typeof(*p), list); + mntput(p->mnt); + list_del(&p->list); + kfree(p); + } +} + +static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler) +{ + struct ve_struct *ve, *old_ve; + struct list_head mntlist; + struct 
vzdq_aquot_de *de; + struct super_block *sb; + struct vz_quota_master *qmblk; + loff_t i, n; + char buf[24]; + int l, err; + + i = 0; + n = file->f_pos; + ve = file->f_dentry->d_sb->s_type->owner_env; + old_ve = set_exec_env(ve); + + INIT_LIST_HEAD(&mntlist); +#ifdef CONFIG_VE + /* + * The only reason of disabling readdir for the host system is that + * this readdir can be slow and CPU consuming with large number of VPSs + * (or just mount points). + */ + err = ve_is_super(ve); +#else + err = 0; +#endif + if (!err) { + err = vzdq_aquot_buildmntlist(ve, &mntlist); + if (err) + goto out_err; + } + + if (i >= n) { + if ((*filler)(data, ".", 1, i, + file->f_dentry->d_inode->i_ino, DT_DIR)) + goto out_fill; + } + i++; + + if (i >= n) { + if ((*filler)(data, "..", 2, i, + parent_ino(file->f_dentry), DT_DIR)) + goto out_fill; + } + i++; + + list_for_each_entry (de, &mntlist, list) { + sb = de->mnt->mnt_sb; + if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL)) + continue; + + qmblk = vzquota_find_qmblk(sb); + if (qmblk == NULL || qmblk == VZ_QUOTA_BAD) + continue; + qmblk_put(qmblk); + i++; + if (i <= n) + continue; + l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev)); + if ((*filler)(data, buf, l, i - 1, + vzdq_aquot_getino(sb->s_dev), DT_DIR)) { + i--; /* this entry did not fit: f_pos must point at it so the next call retries it */ + break; + } + } + +out_fill: + err = 0; + file->f_pos = i; +out_err: + vzdq_aquot_releasemntlist(ve, &mntlist); + (void)set_exec_env(old_ve); + return err; +} + +static int vzdq_aquotd_looktest(struct inode *inode, void *data) +{ + return inode->i_op == &vzdq_aquotq_inode_operations && + vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data; +} + +static int vzdq_aquotd_lookset(struct inode *inode, void *data) +{ + dev_t dev; + + dev = (dev_t)(unsigned long)data; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = vzdq_aquot_getino(dev); + inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_nlink = 2; + inode->i_op = 
&vzdq_aquotq_inode_operations; + inode->i_fop = &vzdq_aquotq_file_operations; + vzdq_aquot_setidev(inode, dev); + return 0; +} + +static struct dentry *vzdq_aquotd_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct ve_struct *ve, *old_ve; + const unsigned char *s; + int l; + dev_t dev; + struct inode *inode; + + ve = dir->i_sb->s_type->owner_env; + old_ve = set_exec_env(ve); +#ifdef CONFIG_VE + /* + * Lookup is much lighter than readdir, so it can be allowed for the + * host system. But it would be strange to be able to do lookup only + * without readdir... + */ + if (ve_is_super(ve)) + goto out; +#endif + + dev = 0; + l = dentry->d_name.len; + if (l <= 0) + goto out; + for (s = dentry->d_name.name; l > 0; s++, l--) { + if (!isxdigit(*s)) + goto out; + if (dev & ~(~0UL >> 4)) + goto out; + dev <<= 4; + if (isdigit(*s)) + dev += *s - '0'; + else if (islower(*s)) + dev += *s - 'a' + 10; + else + dev += *s - 'A' + 10; + } + dev = new_decode_dev(dev); + + if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL)) + goto out; + + inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev), + vzdq_aquotd_looktest, vzdq_aquotd_lookset, + (void *)(unsigned long)dev); + if (inode == NULL) + goto out; + unlock_new_inode(inode); + + d_add(dentry, inode); + (void)set_exec_env(old_ve); + return NULL; + +out: + (void)set_exec_env(old_ve); + return ERR_PTR(-ENOENT); +} + +static int vzdq_aquotd_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct ve_struct *ve, *old_ve; + struct list_head mntlist, *pos; + + generic_fillattr(dentry->d_inode, stat); + ve = dentry->d_sb->s_type->owner_env; +#ifdef CONFIG_VE + /* + * The only reason of disabling getattr for the host system is that + * this getattr can be slow and CPU consuming with large number of VPSs + * (or just mount points). 
+ */ + if (ve_is_super(ve)) + return 0; +#endif + INIT_LIST_HEAD(&mntlist); + old_ve = set_exec_env(ve); + if (!vzdq_aquot_buildmntlist(ve, &mntlist)) + list_for_each(pos, &mntlist) + stat->nlink++; + vzdq_aquot_releasemntlist(ve, &mntlist); + (void)set_exec_env(old_ve); + return 0; +} + +static struct file_operations vzdq_aquotd_file_operations = { + .read = &generic_read_dir, + .readdir = &vzdq_aquotd_readdir, +}; + +static struct inode_operations vzdq_aquotd_inode_operations = { + .lookup = &vzdq_aquotd_lookup, + .getattr = &vzdq_aquotd_getattr, +}; + + +/* ---------------------------------------------------------------------- + * + * Initialization and deinitialization + * + * --------------------------------------------------------------------- */ +static int fake_data; +static struct ctl_table fake_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = ".fake", + .mode = 0600, + .proc_handler = proc_dointvec, + .data = &fake_data, + .maxlen = sizeof(int), + }, + { } +}; + +static struct ctl_path fake_path[] = { + { .ctl_name = CTL_FS, .procname = "fs", }, + { .ctl_name = FS_DQSTATS, .procname = "quota", }, + { } +}; + +/* + * FIXME: creation of proc entries here is unsafe with respect to module + * unloading. 
+ */ +void vzaquota_init(void) +{ + struct proc_dir_entry *de; + + de = proc_create("vzaquota", S_IFDIR | S_IRUSR | S_IXUSR, + glob_proc_vz_dir, &vzdq_aquotd_file_operations); + if (de != NULL) + de->proc_iops = &vzdq_aquotd_inode_operations; + else + printk("VZDQ: vz/vzaquota creation failed\n"); + + register_sysctl_glob_paths(fake_path, fake_table, 1); +} + +void vzaquota_fini(void) +{ + remove_proc_entry("vz/vzaquota", NULL); +} diff -urNp linux-2.6.32.48/fs/quota/vzdquota/vzdq_mgmt.c linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_mgmt.c --- linux-2.6.32.48/fs/quota/vzdquota/vzdq_mgmt.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_mgmt.c 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,754 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* ---------------------------------------------------------------------- + * Switching quota on. 
+ * --------------------------------------------------------------------- */
+
+/*
+ * check limits copied from user
+ *
+ * Returns 0 when neither soft limit is above its hard limit,
+ * -EINVAL otherwise.
+ */
+int vzquota_check_sane_limits(struct dq_stat *qstat)
+{
+	int err;
+
+	err = -EINVAL;
+
+	/* softlimit must not exceed hardlimit (equal values are accepted) */
+	if (qstat->bsoftlimit > qstat->bhardlimit)
+		goto out;
+
+	if (qstat->isoftlimit > qstat->ihardlimit)	/* same rule for inodes */
+		goto out;
+
+	err = 0;
+out:
+	return err;
+}
+
+/*
+ * check usage values copied from user
+ *
+ * On top of vzquota_check_sane_limits(), rejects a non-zero expiration
+ * time when the matching usage is below the soft limit.
+ * Returns 0 or -EINVAL.
+ */
+int vzquota_check_sane_values(struct dq_stat *qstat)
+{
+	int err;
+
+	err = -EINVAL;
+
+	/* expiration time must not be set if softlimit was not exceeded */
+	if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != 0)
+		goto out;
+
+	if (qstat->icurrent < qstat->isoftlimit && qstat->itime != 0)
+		goto out;
+
+	err = vzquota_check_sane_limits(qstat);	/* limits are checked last */
+out:
+	return err;
+}
+
+/*
+ * create new quota master block
+ * this function should:
+ *  - copy limits and usage parameters from user buffer;
+ *  - allocate, initialize quota block and insert it to hash;
+ *
+ * Runs entirely under vz_quota_mutex.
+ * Returns 0 on success, -EFAULT when the user buffer cannot be copied,
+ * -EINVAL for quota_id 0 or values rejected by
+ * vzquota_check_sane_values(), or the error encoded in the pointer
+ * returned by vzquota_alloc_master().
+ *
+ * NOTE(review): with compat != 0 and CONFIG_COMPAT unset nothing fills
+ * qstat before it is validated and used - confirm that such a call
+ * cannot happen.
+ */
+static int vzquota_create(unsigned int quota_id,
+		struct vz_quota_stat __user *u_qstat, int compat)
+{
+	int err;
+	struct vz_quota_stat qstat;
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -EFAULT;
+	if (!compat) {
+		if (copy_from_user(&qstat, u_qstat, sizeof(qstat)))
+			goto out;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_vz_quota_stat cqstat;	/* compat layout, converted below */
+		if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat)))
+			goto out;
+		compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat);
+		compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info);
+#endif
+	}
+
+	err = -EINVAL;
+	if (quota_id == 0)	/* id 0 is never accepted */
+		goto out;
+
+	if (vzquota_check_sane_values(&qstat.dq_stat))
+		goto out;
+	err = 0;
+	qmblk = vzquota_alloc_master(quota_id, &qstat);
+
+	if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */
+		err = PTR_ERR(qmblk);
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+/**
+ * vzquota_on - turn quota on
+ *
+ * This function should:
+ *  - find and get
refcnt of directory entry for quota root and corresponding
+ *    mountpoint;
+ *  - find corresponding quota block and mark it with given path;
+ *  - check quota tree;
+ *  - initialize quota for the tree root.
+ */
+static int vzquota_on(unsigned int quota_id, const char __user *quota_root,
+		char __user *buf)
+{
+	int err;
+	struct path path;
+	struct vz_quota_master *qmblk;
+	struct super_block *dqsb;	/* set only when "out" must drop a super reference */
+
+	dqsb = NULL;
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;		/* no master block with this id */
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EBUSY;		/* only a block in VZDQ_STARTING state can be switched on */
+	if (qmblk->dq_state != VZDQ_STARTING)
+		goto out;
+
+	err = user_path(quota_root, &path);
+	if (err)
+		goto out;
+	/* init path must be a directory */
+	err = -ENOTDIR;
+	if (!S_ISDIR(path.dentry->d_inode->i_mode))
+		goto out_path;
+
+	qmblk->dq_root_path = path;
+	qmblk->dq_sb = path.dentry->d_inode->i_sb;
+	err = vzquota_get_super(qmblk->dq_sb);
+	if (err)
+		goto out_super;	/* nothing to put: dqsb stays NULL */
+
+	/*
+	 * Serialization with quota initialization and operations is performed
+	 * through generation check: generation is memorized before qmblk is
+	 * found and compared under inode_qmblk_lock with assignment.
+	 *
+	 * Note that the dentry tree is shrunk only for high-level logical
+	 * serialization, purely as a courtesy to the user: to have consistent
+	 * quota statistics, files should be closed etc. on quota on.
+	 *
+	 * buf is passed through untouched to vzquota_on_qmblk().
+	 */
+	err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_path.dentry->d_inode,
+			qmblk, buf);
+	if (err)
+		goto out_init;
+	qmblk->dq_state = VZDQ_WORKING;
+
+	mutex_unlock(&vz_quota_mutex);	/* success: path is not put, it stays in dq_root_path */
+	return 0;
+
+out_init:
+	dqsb = qmblk->dq_sb;	/* remember the sb: its reference is dropped at "out" */
+out_super:
+	/* clear for qmblk_put/quota_free_master */
+	qmblk->dq_sb = NULL;
+	qmblk->dq_root_path.dentry = NULL;
+	qmblk->dq_root_path.mnt = NULL;
+out_path:
+	path_put(&path);
+out:
+	if (dqsb)
+		vzquota_put_super(dqsb);
+	mutex_unlock(&vz_quota_mutex);
+	return err;
+}
+
+
+/* ----------------------------------------------------------------------
+ * Switching quota off.
+ * --------------------------------------------------------------------- */ + +/* + * destroy quota block by ID + */ +static int vzquota_destroy(unsigned int quota_id) +{ + int err; + struct vz_quota_master *qmblk; + struct path root; + + mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state == VZDQ_WORKING) + goto out; /* quota_off first */ + + list_del_init(&qmblk->dq_hash); + root = qmblk->dq_root_path; + qmblk->dq_root_path.dentry = NULL; + qmblk->dq_root_path.mnt = NULL; + + if (qmblk->dq_sb) + vzquota_put_super(qmblk->dq_sb); + mutex_unlock(&vz_quota_mutex); + + qmblk_put(qmblk); + path_put(&root); + return 0; + +out: + mutex_unlock(&vz_quota_mutex); + return err; +} + +/** + * vzquota_off - turn quota off + */ + +static int __vzquota_sync_list(struct list_head *lh, + struct vz_quota_master *qmblk, + enum writeback_sync_modes sync_mode) +{ + struct writeback_control wbc; + LIST_HEAD(list); + struct vz_quota_ilink *qlnk; + struct inode *inode; + int err, ret; + + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = sync_mode; + + err = ret = 0; + while (!list_empty(lh)) { + if (need_resched()) { + inode_qmblk_unlock(qmblk->dq_sb); + schedule(); + inode_qmblk_lock(qmblk->dq_sb); + continue; + } + + qlnk = list_first_entry(lh, struct vz_quota_ilink, list); + list_move(&qlnk->list, &list); + + inode = igrab(QLNK_INODE(qlnk)); + if (!inode) + continue; + + inode_qmblk_unlock(qmblk->dq_sb); + + wbc.nr_to_write = LONG_MAX; + ret = sync_inode(inode, &wbc); + if (ret) + err = ret; + iput(inode); + + inode_qmblk_lock(qmblk->dq_sb); + } + + list_splice(&list, lh); + return err; +} + +static int vzquota_sync_list(struct list_head *lh, + struct vz_quota_master *qmblk) +{ + (void)__vzquota_sync_list(lh, qmblk, WB_SYNC_NONE); + return __vzquota_sync_list(lh, qmblk, WB_SYNC_ALL); +} + +static int vzquota_sync_inodes(struct vz_quota_master *qmblk) +{ + int err; + 
LIST_HEAD(qlnk_list); + + list_splice_init(&qmblk->dq_ilink_list, &qlnk_list); + err = vzquota_sync_list(&qlnk_list, qmblk); + if (!err && !list_empty(&qmblk->dq_ilink_list)) + err = -EBUSY; + list_splice(&qlnk_list, &qmblk->dq_ilink_list); + + return err; +} + +static int vzquota_off(unsigned int quota_id, char __user *buf, int force) +{ + int err, ret; + struct vz_quota_master *qmblk; + + mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EALREADY; + if (qmblk->dq_state != VZDQ_WORKING) + goto out; + + inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */ + ret = vzquota_sync_inodes(qmblk); + inode_qmblk_unlock(qmblk->dq_sb); + + err = vzquota_off_qmblk(qmblk->dq_sb, qmblk, buf, force); + if (err) + goto out; + + err = ret; + /* vzquota_destroy will free resources */ + qmblk->dq_state = VZDQ_STOPING; +out: + mutex_unlock(&vz_quota_mutex); + + return err; +} + + +/* ---------------------------------------------------------------------- + * Other VZQUOTA ioctl's. + * --------------------------------------------------------------------- */ + +/* + * this function should: + * - set new limits/buffer under quota master block lock + * - if new softlimit less then usage, then set expiration time + * - no need to alloc ugid hash table - we'll do that on demand + */ +int vzquota_update_limit(struct dq_stat *_qstat, + struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + if (vzquota_check_sane_limits(qstat)) + goto out; + + err = 0; + + /* limits */ + _qstat->bsoftlimit = qstat->bsoftlimit; + _qstat->bhardlimit = qstat->bhardlimit; + /* + * If the soft limit is exceeded, administrator can override the moment + * when the grace period for limit exceeding ends. + * Specifying the moment may be useful if the soft limit is set to be + * lower than the current usage. 
In the latter case, if the grace + * period end isn't specified, the grace period will start from the + * moment of the first write operation. + * There is a race with the user level. Soft limit may be already + * exceeded before the limit change, and grace period end calculated by + * the kernel will be overriden. User level may check if the limit is + * already exceeded, but check and set calls are not atomic. + * This race isn't dangerous. Under normal cicrumstances, the + * difference between the grace period end calculated by the kernel and + * the user level should be not greater than as the difference between + * the moments of check and set calls, i.e. not bigger than the quota + * timer resolution - 1 sec. + */ + if (qstat->btime != (time_t)0 && + _qstat->bcurrent >= _qstat->bsoftlimit) + _qstat->btime = qstat->btime; + + _qstat->isoftlimit = qstat->isoftlimit; + _qstat->ihardlimit = qstat->ihardlimit; + if (qstat->itime != (time_t)0 && + _qstat->icurrent >= _qstat->isoftlimit) + _qstat->itime = qstat->itime; + +out: + return err; +} + +/* + * set new quota limits. + * this function should: + * copy new limits from user level + * - find quota block + * - set new limits and flags. 
+ */
+static int vzquota_setlimit(unsigned int quota_id,
+		struct vz_quota_stat __user *u_qstat, int compat)
+{
+	int err;
+	struct vz_quota_stat qstat;	/* NOTE(review): stays unfilled if compat != 0 without CONFIG_COMPAT */
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex); /* for hash list protection */
+
+	err = -ENOENT;		/* no master block with this id */
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EFAULT;		/* user buffer could not be copied */
+	if (!compat) {
+		if (copy_from_user(&qstat, u_qstat, sizeof(qstat)))
+			goto out;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_vz_quota_stat cqstat;
+		if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat)))
+			goto out;
+		compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat);
+		compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info);
+#endif
+	}
+
+	qmblk_data_write_lock(qmblk);
+	err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat);
+	if (err == 0)		/* dq_info is replaced only when the limits were accepted */
+		qmblk->dq_info = qstat.dq_info;
+	qmblk_data_write_unlock(qmblk);
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	return err;
+}
+
+/*
+ * get quota limits.
+ * very simple - just return stat buffer to user
+ *
+ * dq_stat and dq_info are snapshotted under the qmblk data read lock
+ * and copied to user space after the lock is dropped.
+ * Returns 0, -ENOENT when quota_id has no master block, or -EFAULT
+ * when the copy to user space fails.
+ */
+static int vzquota_getstat(unsigned int quota_id,
+		struct vz_quota_stat __user *u_qstat, int compat)
+{
+	int err;
+	struct vz_quota_stat qstat;
+	struct vz_quota_master *qmblk;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	qmblk_data_read_lock(qmblk);
+	/* copy whole buffer under lock */
+	memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat));
+	memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info));
+	qmblk_data_read_unlock(qmblk);
+
+	if (!compat)
+		err = copy_to_user(u_qstat, &qstat, sizeof(qstat));
+	else {
+#ifdef CONFIG_COMPAT
+		struct compat_vz_quota_stat cqstat;
+		dqstat2compat_dqstat(&qstat.dq_stat, &cqstat.dq_stat);
+		dqinfo2compat_dqinfo(&qstat.dq_info, &cqstat.dq_info);
+		err = copy_to_user(u_qstat, &cqstat, sizeof(cqstat));
+#endif
+	}
+	if (err)		/* any non-zero value at this point is reported as -EFAULT */
+		err = -EFAULT;
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	return err;
+}
+
+/*
+ * This is
a system call to turn per-VE disk quota on. + * Note this call is allowed to run ONLY from VE0 + */ +long do_vzquotactl(int cmd, unsigned int quota_id, + struct vz_quota_stat __user *qstat, const char __user *ve_root, + int compat) +{ + int ret; + int force = 0; + + ret = -EPERM; + /* access allowed only from root of VE0 */ + if (!capable(CAP_SYS_RESOURCE) || + !capable(CAP_SYS_ADMIN)) + goto out; + + switch (cmd) { + case VZ_DQ_CREATE: + ret = vzquota_create(quota_id, qstat, compat); + break; + case VZ_DQ_DESTROY: + ret = vzquota_destroy(quota_id); + break; + case VZ_DQ_ON: + /* + * qstat is just a pointer to userspace buffer to + * store busy files path in case of vzquota_on fail + */ + ret = vzquota_on(quota_id, ve_root, (char *)qstat); + break; + case VZ_DQ_OFF_FORCED: + force = 1; + case VZ_DQ_OFF: + /* + * ve_root is just a pointer to userspace buffer to + * store busy files path in case of vzquota_off fail + */ + ret = vzquota_off(quota_id, (char *)ve_root, force); + break; + case VZ_DQ_SETLIMIT: + ret = vzquota_setlimit(quota_id, qstat, compat); + break; + case VZ_DQ_GETSTAT: + ret = vzquota_getstat(quota_id, qstat, compat); + break; + + default: + ret = -EINVAL; + goto out; + } + +out: + return ret; +} + + +/* ---------------------------------------------------------------------- + * Proc filesystem routines + * ---------------------------------------------------------------------*/ + +#if defined(CONFIG_PROC_FS) + +#define QUOTA_UINT_LEN 15 +#define QUOTA_TIME_LEN_FMT_UINT "%11u" +#define QUOTA_NUM_LEN_FMT_UINT "%15u" +#define QUOTA_NUM_LEN_FMT_ULL "%15Lu" +#define QUOTA_TIME_LEN_FMT_STR "%11s" +#define QUOTA_NUM_LEN_FMT_STR "%15s" +#define QUOTA_PROC_MAX_LINE_LEN 2048 + +/* + * prints /proc/ve_dq header line + */ +static int print_proc_header(char * buffer) +{ + return sprintf(buffer, + "%-11s" + QUOTA_NUM_LEN_FMT_STR + QUOTA_NUM_LEN_FMT_STR + QUOTA_NUM_LEN_FMT_STR + QUOTA_TIME_LEN_FMT_STR + QUOTA_TIME_LEN_FMT_STR + "\n", + "qid: path", + "usage", 
"softlimit", "hardlimit", "time", "expire"); +} + +/* + * prints proc master record id, dentry path + */ +static int print_proc_master_id(char * buffer, char * path_buf, + struct vz_quota_master * qp) +{ + char *path; + int over; + + path = NULL; + switch (qp->dq_state) { + case VZDQ_WORKING: + if (!path_buf) { + path = ""; + break; + } + path = d_path(&qp->dq_root_path, path_buf, PAGE_SIZE); + if (IS_ERR(path)) { + path = ""; + break; + } + /* do not print large path, truncate it */ + over = strlen(path) - + (QUOTA_PROC_MAX_LINE_LEN - 3 - 3 - + QUOTA_UINT_LEN); + if (over > 0) { + path += over - 3; + path[0] = path[1] = path[3] = '.'; + } + break; + case VZDQ_STARTING: + path = "-- started --"; + break; + case VZDQ_STOPING: + path = "-- stopped --"; + break; + } + + return sprintf(buffer, "%u: %s\n", qp->dq_id, path); +} + +/* + * prints struct vz_quota_stat data + */ +static int print_proc_stat(char * buffer, struct dq_stat *qs, + struct dq_info *qi) +{ + return sprintf(buffer, + "%11s" + QUOTA_NUM_LEN_FMT_ULL + QUOTA_NUM_LEN_FMT_ULL + QUOTA_NUM_LEN_FMT_ULL + QUOTA_TIME_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + "\n" + "%11s" + QUOTA_NUM_LEN_FMT_UINT + QUOTA_NUM_LEN_FMT_UINT + QUOTA_NUM_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + "\n", + "1k-blocks", + (unsigned long long)qs->bcurrent >> 10, + (unsigned long long)qs->bsoftlimit >> 10, + (unsigned long long)qs->bhardlimit >> 10, + (unsigned int)qs->btime, + (unsigned int)qi->bexpire, + "inodes", + qs->icurrent, + qs->isoftlimit, + qs->ihardlimit, + (unsigned int)qs->itime, + (unsigned int)qi->iexpire); +} + + +/* + * for /proc filesystem output + */ +static int vzquota_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len, i; + off_t printed = 0; + char *p = page; + struct vz_quota_master *qp; + struct vz_quota_ilink *ql2; + struct list_head *listp; + char *path_buf; + + path_buf = (char*)__get_free_page(GFP_KERNEL); + if (path_buf == NULL) + return 
-ENOMEM; + + len = print_proc_header(p); + printed += len; + if (off < printed) /* keep header in output */ { + *start = p + off; + p += len; + } + + mutex_lock(&vz_quota_mutex); + + /* traverse master hash table for all records */ + for (i = 0; i < vzquota_hash_size; i++) { + list_for_each(listp, &vzquota_hash_table[i]) { + qp = list_entry(listp, + struct vz_quota_master, dq_hash); + + /* Skip other VE's information if not root of VE0 */ + if ((!capable(CAP_SYS_ADMIN) || + !capable(CAP_SYS_RESOURCE))) { + ql2 = INODE_QLNK(current->fs->root.dentry->d_inode); + if (ql2 == NULL || qp != ql2->qmblk) + continue; + } + /* + * Now print the next record + */ + len = 0; + /* we print quotaid and path only in VE0 */ + if (capable(CAP_SYS_ADMIN)) + len += print_proc_master_id(p+len,path_buf, qp); + len += print_proc_stat(p+len, &qp->dq_stat, + &qp->dq_info); + printed += len; + /* skip unnecessary lines */ + if (printed <= off) + continue; + p += len; + /* provide start offset */ + if (*start == NULL) + *start = p + (off - printed); + /* have we printed all requested size? 
*/ + if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN || + (p - *start) >= count) + goto out; + } + } + + *eof = 1; /* checked all hash */ +out: + mutex_unlock(&vz_quota_mutex); + + len = 0; + if (*start != NULL) { + len = (p - *start); + if (len > count) + len = count; + } + + if (path_buf) + free_page((unsigned long) path_buf); + + return len; +} + +/* + * Register procfs read callback + */ +int vzquota_proc_init(void) +{ + struct proc_dir_entry *de; + + de = proc_create("vzquota", S_IFREG|S_IRUSR, proc_vz_dir, NULL); + if (de == NULL) + return -EBUSY; + + de->read_proc = vzquota_read_proc; + de->data = NULL; + return 0; +} + +void vzquota_proc_release(void) +{ + /* Unregister procfs read callback */ + remove_proc_entry("vzquota", proc_vz_dir); +} + +#endif diff -urNp linux-2.6.32.48/fs/quota/vzdquota/vzdq_ops.c linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_ops.c --- linux-2.6.32.48/fs/quota/vzdquota/vzdq_ops.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_ops.c 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,647 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + */ + +#include +#include +#include +#include +#include +#include + + +/* ---------------------------------------------------------------------- + * Quota superblock operations - helper functions. 
+ * --------------------------------------------------------------------- */
+
+/* Account @number more inodes in @dqstat. */
+static inline void vzquota_incr_inodes(struct dq_stat *dqstat,
+		unsigned long number)
+{
+	dqstat->icurrent = dqstat->icurrent + number;
+}
+
+/* Account @number more bytes in @dqstat. */
+static inline void vzquota_incr_space(struct dq_stat *dqstat,
+		__u64 number)
+{
+	dqstat->bcurrent = dqstat->bcurrent + number;
+}
+
+/*
+ * Take @number inodes off the usage, clamping at zero, and clear the
+ * inode grace timestamp once usage is below the soft limit.
+ */
+static inline void vzquota_decr_inodes(struct dq_stat *dqstat,
+		unsigned long number)
+{
+	dqstat->icurrent = (dqstat->icurrent > number) ?
+				dqstat->icurrent - number : 0;
+
+	if (dqstat->isoftlimit > dqstat->icurrent)
+		dqstat->itime = (time_t) 0;
+}
+
+/*
+ * Take @number bytes off the usage, clamping at zero, and clear the
+ * block grace timestamp once usage is below the soft limit.
+ */
+static inline void vzquota_decr_space(struct dq_stat *dqstat,
+		__u64 number)
+{
+	dqstat->bcurrent = (dqstat->bcurrent > number) ?
+				dqstat->bcurrent - number : 0;
+
+	if (dqstat->bsoftlimit > dqstat->bcurrent)
+		dqstat->btime = (time_t) 0;
+}
+
+/*
+ * Print @fmt (which takes @dq_id as its only argument) unless @flag is
+ * already set in dq_info->flags, then set @flag.  A @flag of 0 never
+ * suppresses the message.
+ *
+ * Original note: better printk() message or use /proc/vzquotamsg
+ * interface similar to /proc/kmsg
+ */
+static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag,
+		const char *fmt)
+{
+	/* only the first warning per flag and master block is printed */
+	if (!(dq_info->flags & flag)) {
+		printk(fmt, dq_id);
+		dq_info->flags |= flag;
+	}
+}
+
+/*
+ * ignore_hardlimit -
+ *
+ * Intended to allow superuser of VE0 to overwrite hardlimits.
+ * + * ignore_hardlimit() has a very bad feature: + * + * writepage() operation for writable mapping of a file with holes + * may trigger get_block() with wrong current and as a consequence, + * opens a possibility to overcommit hardlimits + */ +/* for the reason above, it is disabled now */ +static inline int ignore_hardlimit(struct dq_info *dqstat) +{ +#if 0 + return ve_is_super(get_exec_env()) && + capable(CAP_SYS_RESOURCE) && + (dqstat->options & VZ_QUOTA_OPT_RSQUASH); +#else + return 0; +#endif +} + +static int vzquota_check_inodes(struct dq_info *dq_info, + struct dq_stat *dqstat, + unsigned long number, int dq_id) +{ + if (number == 0) + return QUOTA_OK; + + if (dqstat->icurrent + number > dqstat->ihardlimit && + !ignore_hardlimit(dq_info)) { + vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, + "VZ QUOTA: file hardlimit reached for id=%d\n"); + return NO_QUOTA; + } + + if (dqstat->icurrent + number > dqstat->isoftlimit) { + if (dqstat->itime == (time_t)0) { + vzquota_warn(dq_info, dq_id, 0, + "VZ QUOTA: file softlimit exceeded " + "for id=%d\n"); + dqstat->itime = CURRENT_TIME_SECONDS + + dq_info->iexpire; + } else if (CURRENT_TIME_SECONDS >= dqstat->itime && + !ignore_hardlimit(dq_info)) { + vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, + "VZ QUOTA: file softlimit expired " + "for id=%d\n"); + return NO_QUOTA; + } + } + + return QUOTA_OK; +} + +static int vzquota_check_space(struct dq_info *dq_info, + struct dq_stat *dqstat, + __u64 number, int dq_id, char prealloc) +{ + if (number == 0) + return QUOTA_OK; + + if (prealloc == DQUOT_CMD_FORCE) + return QUOTA_OK; + + if (dqstat->bcurrent + number > dqstat->bhardlimit && + !ignore_hardlimit(dq_info)) { + if (!prealloc) + vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, + "VZ QUOTA: disk hardlimit reached " + "for id=%d\n"); + return NO_QUOTA; + } + + if (dqstat->bcurrent + number > dqstat->bsoftlimit) { + if (dqstat->btime == (time_t)0) { + if (!prealloc) { + vzquota_warn(dq_info, dq_id, 0, + "VZ QUOTA: disk 
softlimit exceeded " + "for id=%d\n"); + dqstat->btime = CURRENT_TIME_SECONDS + + dq_info->bexpire; + } else { + /* + * Original Linux quota doesn't allow + * preallocation to exceed softlimit so + * exceeding will be always printed + */ + return NO_QUOTA; + } + } else if (CURRENT_TIME_SECONDS >= dqstat->btime && + !ignore_hardlimit(dq_info)) { + if (!prealloc) + vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, + "VZ QUOTA: disk quota " + "softlimit expired " + "for id=%d\n"); + return NO_QUOTA; + } + } + + return QUOTA_OK; +} + +#ifdef CONFIG_VZ_QUOTA_UGID +static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid[], + int type, unsigned long number) +{ + struct dq_info *dqinfo; + struct dq_stat *dqstat; + + if (qugid[type] == NULL) + return QUOTA_OK; + if (qugid[type] == VZ_QUOTA_UGBAD) + return NO_QUOTA; + + if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) + return QUOTA_OK; + if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) + return QUOTA_OK; + if (number == 0) + return QUOTA_OK; + + dqinfo = &qmblk->dq_ugid_info[type]; + dqstat = &qugid[type]->qugid_stat; + + if (dqstat->ihardlimit != 0 && + dqstat->icurrent + number > dqstat->ihardlimit) + return NO_QUOTA; + + if (dqstat->isoftlimit != 0 && + dqstat->icurrent + number > dqstat->isoftlimit) { + if (dqstat->itime == (time_t)0) + dqstat->itime = CURRENT_TIME_SECONDS + + dqinfo->iexpire; + else if (CURRENT_TIME_SECONDS >= dqstat->itime) + return NO_QUOTA; + } + + return QUOTA_OK; +} + +static int vzquota_check_ugid_space(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid[], + int type, __u64 number, char prealloc) +{ + struct dq_info *dqinfo; + struct dq_stat *dqstat; + + if (prealloc == DQUOT_CMD_FORCE) + return QUOTA_OK; + + if (qugid[type] == NULL) + return QUOTA_OK; + if (qugid[type] == VZ_QUOTA_UGBAD) + return NO_QUOTA; + + if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) + return QUOTA_OK; + if (type == GRPQUOTA && 
!(qmblk->dq_flags & VZDQ_GRPQUOTA)) + return QUOTA_OK; + if (number == 0) + return QUOTA_OK; + + dqinfo = &qmblk->dq_ugid_info[type]; + dqstat = &qugid[type]->qugid_stat; + + if (dqstat->bhardlimit != 0 && + dqstat->bcurrent + number > dqstat->bhardlimit) + return NO_QUOTA; + + if (dqstat->bsoftlimit != 0 && + dqstat->bcurrent + number > dqstat->bsoftlimit) { + if (dqstat->btime == (time_t)0) { + if (!prealloc) + dqstat->btime = CURRENT_TIME_SECONDS + + dqinfo->bexpire; + else + /* + * Original Linux quota doesn't allow + * preallocation to exceed softlimit so + * exceeding will be always printed + */ + return NO_QUOTA; + } else if (CURRENT_TIME_SECONDS >= dqstat->btime) + return NO_QUOTA; + } + + return QUOTA_OK; +} +#endif + +/* ---------------------------------------------------------------------- + * Quota superblock operations + * --------------------------------------------------------------------- */ + +/* + * S_NOQUOTA note. + * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for + * - quota file (absent in our case) + * - after explicit DQUOT_DROP (earlier than clear_inode) in functions like + * filesystem-specific new_inode, before the inode gets outside links. + * For the latter case, the only quota operation where care about S_NOQUOTA + * might be required is vzquota_drop, but there S_NOQUOTA has already been + * checked in DQUOT_DROP(). + * So, S_NOQUOTA may be ignored for now in the VZDQ code. + * + * The above note is not entirely correct. + * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from + * delete_inode if new_inode fails (for example, because of inode quota + * limits), so S_NOQUOTA check is needed in free_inode. + * This seems to be the dark corner of the current quota API. + */ + +/* + * Initialize quota operations for the specified inode. 
+ */
+static int vzquota_initialize(struct inode *inode, int type)
+{
+	/* @type is not used here */
+	vzquota_inode_init_call(inode);
+	return 0; /* ignored by caller */
+}
+
+/*
+ * Release quota for the specified inode.
+ */
+static int vzquota_drop(struct inode *inode)
+{
+	vzquota_inode_drop_call(inode);
+	return 0; /* ignored by caller */
+}
+
+/*
+ * Allocate block callback.
+ *
+ * If (prealloc) disk quota exceeding warning is not printed.
+ * See Linux quota to know why.
+ *
+ * All limits (the master block first, then every ugid record with
+ * CONFIG_VZ_QUOTA_UGID) are checked before any counter is changed, so
+ * a refused allocation leaves the accounting untouched.
+ *
+ * Return:
+ *      QUOTA_OK == 0 on SUCCESS
+ *      NO_QUOTA == 1 if allocation should fail
+ */
+static int vzquota_alloc_space(struct inode *inode,
+		qsize_t number, int prealloc)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+	int ret = QUOTA_OK;
+
+	qmblk = vzquota_inode_data(inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return NO_QUOTA;
+	if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid[MAXQUOTAS];
+#endif
+
+		/* checking first */
+		ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat,
+				number, qmblk->dq_id, prealloc);
+		if (ret == NO_QUOTA)
+			goto no_quota;
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+			ret = vzquota_check_ugid_space(qmblk, qugid,
+					cnt, number, prealloc);
+			if (ret == NO_QUOTA)
+				goto no_quota;
+		}
+		/* check ok, may increment */
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			/*
+			 * VZ_QUOTA_UGBAD is a marker, not a record.
+			 * vzquota_check_ugid_space() accepts
+			 * DQUOT_CMD_FORCE before it looks at the marker,
+			 * so it can still be present here; skip it as
+			 * vzquota_free_space() does.
+			 */
+			if (qugid[cnt] == NULL || qugid[cnt] == VZ_QUOTA_UGBAD)
+				continue;
+			vzquota_incr_space(&qugid[cnt]->qugid_stat, number);
+		}
+#endif
+		vzquota_incr_space(&qmblk->dq_stat, number);
+		vzquota_data_unlock(inode, &data);
+	}
+
+	/* i_bytes is updated even when the inode has no master block */
+	inode_add_bytes(inode, number);
+	might_sleep();
+	return QUOTA_OK;
+
+no_quota:
+	vzquota_data_unlock(inode, &data);
+	return NO_QUOTA;
+}
+
+/*
+ * Allocate inodes callback.
+ *
+ * Return:
+ *      QUOTA_OK == 0 on SUCCESS
+ *      NO_QUOTA == 1 if allocation should fail
+ */
+static int vzquota_alloc_inode(const struct inode *inode, qsize_t number)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;	/* filled by vzquota_inode_data(), consumed by vzquota_data_unlock() */
+	int ret = QUOTA_OK;
+
+	qmblk = vzquota_inode_data((struct inode *)inode, &data);	/* const is cast away for the helper */
+	if (qmblk == VZ_QUOTA_BAD)
+		return NO_QUOTA;
+	if (qmblk != NULL) {	/* NULL: nothing is checked or accounted for this inode */
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid *qugid[MAXQUOTAS];
+#endif
+
+		/* checking first */
+		ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat,
+				number, qmblk->dq_id);
+		if (ret == NO_QUOTA)
+			goto no_quota;
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+			ret = vzquota_check_ugid_inodes(qmblk, qugid,
+					cnt, number);
+			if (ret == NO_QUOTA)	/* includes a VZ_QUOTA_UGBAD record */
+				goto no_quota;
+		}
+		/* check ok, may increment */
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (qugid[cnt] == NULL)
+				continue;
+			vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number);
+		}
+#endif
+		vzquota_incr_inodes(&qmblk->dq_stat, number);	/* master counter is bumped after the ugid ones */
+		vzquota_data_unlock((struct inode *)inode, &data);
+	}
+
+	might_sleep();
+	return QUOTA_OK;
+
+no_quota:
+	vzquota_data_unlock((struct inode *)inode, &data);	/* reached only with a usable qmblk */
+	return NO_QUOTA;
+}
+
+/*
+ * Free space callback.
+ */
+static int vzquota_free_space(struct inode *inode, qsize_t number)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+	/* VZ_QUOTA_BAD: report NO_QUOTA; NULL: no master to uncharge; else uncharge it */
+	qmblk = vzquota_inode_data(inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return NO_QUOTA; /* isn't checked by the caller */
+	if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid;
+#endif
+		/* uncharge the master, then every usable user/group record of the inode */
+		vzquota_decr_space(&qmblk->dq_stat, number);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid = INODE_QLNK(inode)->qugid[cnt];
+			if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
+				continue;
+			vzquota_decr_space(&qugid->qugid_stat, number);
+		}
+#endif
+		vzquota_data_unlock(inode, &data);
+	}
+	inode_sub_bytes(inode, number);	/* also done when qmblk is NULL */
+	might_sleep();
+	return QUOTA_OK;
+}
+
+/*
+ * Free inodes callback.
+ *
+ * Mirror of vzquota_free_space() for inode counts: decrements the master
+ * and every non-NULL, non-VZ_QUOTA_UGBAD ugid record by @number.
+ * Returns NO_QUOTA only when vzquota_inode_data() yields VZ_QUOTA_BAD,
+ * QUOTA_OK otherwise (including when the inode has no quota master).
+ */
+static int vzquota_free_inode(const struct inode *inode, qsize_t number)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+
+	qmblk = vzquota_inode_data((struct inode *)inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return NO_QUOTA;
+	if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid;
+#endif
+
+		vzquota_decr_inodes(&qmblk->dq_stat, number);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid = INODE_QLNK(inode)->qugid[cnt];
+			if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
+				continue;
+			vzquota_decr_inodes(&qugid->qugid_stat, number);
+		}
+#endif
+		vzquota_data_unlock((struct inode *)inode, &data);
+	}
+	might_sleep();
+	return QUOTA_OK;
+}
+/*
+ * Detach @inode from vzquota accounting.
+ *
+ * Does nothing for inodes whose superblock does not use
+ * vz_quota_operations or for which vzquota_inode_data() reports
+ * VZ_QUOTA_BAD.  Otherwise S_NOQUOTA is set on the inode, its bytes and
+ * one inode are subtracted from the master and from each usable ugid
+ * record (when a master exists), and vzquota_inode_drop_call() is invoked.
+ */
+void vzquota_inode_off(struct inode * inode)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_datast data;
+
+	/* The call is made through virtinfo, it can be an inode
+	 * not controlled by vzquota.
+	 */
+	if (inode->i_sb->dq_op != &vz_quota_operations)
+		return;
+
+	qmblk = vzquota_inode_data(inode, &data);
+	if (qmblk == VZ_QUOTA_BAD)
+		return;
+
+	if (qmblk == NULL) {
+		/* Tricky place. If qmblk == NULL, it means that this inode
+		 * is not in area controlled by vzquota (except for rare
+		 * case of already set S_NOQUOTA). But we have to set
+		 * S_NOQUOTA in any case because vzquota can be turned
+		 * on later, when this inode is invalid from viewpoint
+		 * of vzquota.
+		 *
+		 * To be safe, we reacquire vzquota lock.
+		 * The assumption is that it would not hurt to call
+		 * vzquota_inode_drop() more than once, but it must
+		 * be called at least once after S_NOQUOTA is set.
+		 */
+		inode_qmblk_lock(inode->i_sb);
+		inode->i_flags |= S_NOQUOTA;
+		inode_qmblk_unlock(inode->i_sb);
+	} else {
+		loff_t bytes = inode_get_bytes(inode);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid;
+#endif
+
+		inode->i_flags |= S_NOQUOTA;
+		/* give back everything this inode was charged for: its bytes and one inode */
+		vzquota_decr_space(&qmblk->dq_stat, bytes);
+		vzquota_decr_inodes(&qmblk->dq_stat, 1);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid = INODE_QLNK(inode)->qugid[cnt];
+			if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
+				continue;
+			vzquota_decr_space(&qugid->qugid_stat, bytes);
+			vzquota_decr_inodes(&qugid->qugid_stat, 1);
+		}
+#endif
+
+		vzquota_data_unlock(inode, &data);
+	}
+	vzquota_inode_drop_call(inode);
+}
+
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+
+/*
+ * helper function for quota_transfer
+ * check that we can add inode to this quota_id
+ *
+ * Asks both the space check (for @size bytes) and the inode check (for
+ * one inode) of record @type in @qugid.  Returns 0 when both answer
+ * QUOTA_OK and -1 otherwise.
+ */
+static int vzquota_transfer_check(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid[],
+		unsigned int type, __u64 size)
+{
+	if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK ||
+	    vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK)
+		return -1;
+	return 0;
+}
+/*
+ * Move the usage of @inode (its bytes and one inode) from its current
+ * ugid records to the records in @qlnk, for every quota type whose bit
+ * (1 << type) is set in @mask.
+ *
+ * All selected types are checked before any counter is changed, so a
+ * refused transfer leaves the counters untouched.  Returns 0 on success
+ * and -1 when a selected type is refused.
+ *
+ * NOTE(review): the second loop dereferences the old and the new record
+ * without a NULL / VZ_QUOTA_UGBAD test for the new one; presumably the
+ * caller guarantees valid records for every type in @mask -- confirm.
+ */
+int vzquota_transfer_usage(struct inode *inode,
+		int mask,
+		struct vz_quota_ilink *qlnk)
+{
+	struct vz_quota_ugid *qugid_old;
+	__u64 space;
+	int i;
+
+	space = inode_get_bytes(inode);
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (!(mask & (1 << i)))
+			continue;
+		/*
+		 * Do not permit chown a file if its owner does not have
+		 * ugid record. This might happen if we somehow exceeded
+		 * the UID/GID (e.g. set uglimit less than number of users).
+		 */
+		if (INODE_QLNK(inode)->qugid[i] == VZ_QUOTA_UGBAD)
+			return -1;
+		if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space))
+			return -1;
+	}
+	/* every selected type passed: now actually move the usage */
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (!(mask & (1 << i)))
+			continue;
+		qugid_old = INODE_QLNK(inode)->qugid[i];
+		vzquota_decr_space(&qugid_old->qugid_stat, space);
+		vzquota_decr_inodes(&qugid_old->qugid_stat, 1);
+		vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space);
+		vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1);
+	}
+	return 0;
+}
+
+/*
+ * Transfer the inode between different user/group quotas.
+ *
+ * Maps the result of vzquota_inode_transfer_call() onto the dquot
+ * convention: non-zero becomes NO_QUOTA, zero becomes QUOTA_OK.
+ */
+static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
+{
+	return vzquota_inode_transfer_call(inode, iattr) ?
+		NO_QUOTA : QUOTA_OK;
+}
+/* .swap_inode callback: forwards to vzquota_inode_swap_call() */
+static void vzquota_swap_inode(struct inode *inode, struct inode *tmpl)
+{
+	vzquota_inode_swap_call(inode, tmpl);
+}
+
+
+#else /* CONFIG_VZ_QUOTA_UGID */
+/* without per-user/group quota a transfer always succeeds */
+static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
+{
+	return QUOTA_OK;
+}
+/* without per-user/group quota there is nothing to swap */
+static void vzquota_swap_inode(struct inode *inode, struct inode *tmpl)
+{
+}
+#endif
+
+/*
+ * Called under following semaphores:
+ *	old_d->d_inode->i_sb->s_vfs_rename_sem
+ *	old_d->d_inode->i_sem
+ *	new_d->d_inode->i_sem
+ * [not verified --SAW]
+ *
+ * Returns NO_QUOTA when vzquota_rename_check() returns non-zero and
+ * QUOTA_OK otherwise.
+ */
+static int vzquota_rename(struct inode *inode,
+		struct inode *old_dir, struct inode *new_dir)
+{
+	return vzquota_rename_check(inode, old_dir, new_dir) ?
+		NO_QUOTA : QUOTA_OK;
+}
+
+extern void vzquota_shutdown_super(struct super_block *sb);
+
+/*
+ * Structure of superblock diskquota operations.
+ */
+struct dquot_operations vz_quota_operations = {
+	.initialize	= vzquota_initialize,
+	.drop		= vzquota_drop,
+	.alloc_space	= vzquota_alloc_space,
+	.alloc_inode	= vzquota_alloc_inode,
+	.free_space	= vzquota_free_space,
+	.free_inode	= vzquota_free_inode,
+	.transfer	= vzquota_transfer,
+	.rename		= vzquota_rename,
+
+	.swap_inode	= vzquota_swap_inode,
+	.shutdown	= vzquota_shutdown_super,
+};
diff -urNp linux-2.6.32.48/fs/quota/vzdquota/vzdq_tree.c linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_tree.c
--- linux-2.6.32.48/fs/quota/vzdquota/vzdq_tree.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_tree.c	2011-11-21 17:40:45.000000000 -0500
@@ -0,0 +1,286 @@
+/*
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo quota tree implementation
+ */
+
+#include
+#include
+#include
+/*
+ * Allocate an empty quota tree.
+ *
+ * Every level gets empty used/free node lists and a zero node counter;
+ * the tree starts without a root and with leaf_num == 0.
+ * Returns the new tree, or NULL when kmalloc() fails.
+ */
+struct quotatree_tree *quotatree_alloc(void)
+{
+	int l;
+	struct quotatree_tree *tree;
+
+	tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL);
+	if (tree == NULL)
+		goto out;
+
+	for (l = 0; l < QUOTATREE_DEPTH; l++) {
+		INIT_LIST_HEAD(&tree->levels[l].usedlh);
+		INIT_LIST_HEAD(&tree->levels[l].freelh);
+		tree->levels[l].freenum = 0;
+	}
+	tree->root = NULL;
+	tree->leaf_num = 0;
+out:
+	return tree;
+}
+/*
+ * Walk from the root towards @id, descending at most @level levels and
+ * stopping early at the first empty slot.
+ *
+ * At level l the slot index is (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK.
+ * When @st is not NULL it receives the slot where the walk stopped
+ * (st->block) and the number of levels descended (st->level).
+ * Returns the last node that was entered, or NULL when the tree has no
+ * root (or @level is 0).
+ */
+static struct quotatree_node *
+quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level,
+		struct quotatree_find_state *st)
+{
+	void **block;
+	struct quotatree_node *parent;
+	int l, index;
+
+	parent = NULL;
+	block = (void **)&tree->root;
+	l = 0;
+	while (l < level && *block != NULL) {
+		index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
+		parent = *block;
+		block = parent->blocks + index;
+		l++;
+	}
+	if (st != NULL) {
+		st->block = block;
+		st->level = l;
+	}
+
+	return parent;
+}
+/*
+ * Look up the data stored for @id.
+ *
+ * @st must not be NULL: it is filled by the walk and read here.  On a
+ * miss it describes where the walk stopped, which is what
+ * quotatree_insert() continues from.
+ * Returns the stored pointer (possibly NULL) when the walk reached full
+ * depth, NULL otherwise.
+ */
+void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
+		struct quotatree_find_state *st)
+{
+	quotatree_follow(tree, id, QUOTATREE_DEPTH, st);
+	if (st->level == QUOTATREE_DEPTH)
+		return *st->block;
+	else
+		return NULL;
+}
+/*
+ * Return the @index-th non-NULL leaf, counting through the nodes of the
+ * leaf level's used list in list order and through each node's slots in
+ * ascending order.  Returns NULL when @index is not below
+ * QTREE_LEAFNUM(tree).
+ */
+void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index)
+{
+	int i, count;
+	struct quotatree_node *p;
+	void *leaf;
+
+	if (QTREE_LEAFNUM(tree) <= index)
+		return NULL;
+
+	count = 0;
+	list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
+		for (i = 0; i < QUOTATREE_BSIZE; i++) {
+			leaf = p->blocks[i];
+			if (leaf == NULL)
+				continue;
+			if (count == index)
+				return leaf;
+			count++;
+		}
+	}
+	return NULL;
+}
+
+/* returns data leaf (vz_quota_ugid) after _existent_ ugid (@id)
+ * in the tree...
+ *
+ * The search starts in the slot right after @id inside its own node and
+ * then continues through the following nodes of the leaf level's used
+ * list until the list head is reached.  Returns NULL when nothing
+ * follows, or when the walk towards @id found no node at all. */
+void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id)
+{
+	int off;
+	struct quotatree_node *parent, *p;
+	struct list_head *lh;
+
+	/* get parent referring correct quota tree node of the last level */
+	parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL);
+	if (!parent)
+		return NULL;
+
+	off = (id & QUOTATREE_BMASK) + 1;	/* next ugid */
+	lh = &parent->list;
+	do {
+		p = list_entry(lh, struct quotatree_node, list);
+		for ( ; off < QUOTATREE_BSIZE; off++)
+			if (p->blocks[off])
+				return p->blocks[off];
+		off = 0;
+		lh = lh->next;
+	} while (lh != &QTREE_LEAFLVL(tree)->usedlh);
+
+	return NULL;
+}
+/*
+ * Store @data for @id, creating the missing nodes on the way down.
+ *
+ * @st is the state left behind by quotatree_find() for the same @id:
+ * node creation resumes at st->level / st->block.  A node is taken from
+ * the level's free list when one is available and kmalloc()ed otherwise;
+ * only freshly allocated nodes get a new block number.
+ * Returns 0 on success or -ENOMEM.
+ *
+ * NOTE(review): leaf_num is incremented unconditionally, so this looks
+ * meant to be called only after a failed lookup -- confirm with callers.
+ */
+int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
+		struct quotatree_find_state *st, void *data)
+{
+	struct quotatree_node *p;
+	int l, index;
+
+	while (st->level < QUOTATREE_DEPTH) {
+		l = st->level;
+		if (!list_empty(&tree->levels[l].freelh)) {
+			p = list_entry(tree->levels[l].freelh.next,
+				struct quotatree_node, list);
+			list_del(&p->list);
+		} else {
+			p = kmalloc(sizeof(struct quotatree_node), GFP_NOFS | __GFP_NOFAIL);
+			if (p == NULL)
+				return -ENOMEM;
+			/* save block number in the l-level
+			 * it uses for quota file generation */
+			p->num = tree->levels[l].freenum++;
+		}
+		list_add(&p->list, &tree->levels[l].usedlh);
+		memset(p->blocks, 0, sizeof(p->blocks));
+		*st->block = p;
+		/* step into the new node: the slot for @id becomes the next target */
+		index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
+		st->block = p->blocks + index;
+		st->level++;
+	}
+	tree->leaf_num++;
+	*st->block = data;
+
+	return 0;
+}
+/*
+ * Clear the slot reached by following @id for @level levels and return
+ * the node that holds that slot (NULL when no node was entered).
+ * leaf_num is decremented when the walk reached full depth.
+ */
+static struct quotatree_node *
+quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id,
+		int level)
+{
+	struct quotatree_node *parent;
+	struct quotatree_find_state st;
+
+	parent = quotatree_follow(tree, id, level, &st);
+	if (st.level == QUOTATREE_DEPTH)
+		tree->leaf_num--;
+	*st.block = NULL;
+	return parent;
+}
+/*
+ * Remove the leaf stored for @id and recycle nodes that became empty.
+ *
+ * Going upwards from the leaf level down to QUOTATREE_CDEPTH, a node
+ * with no remaining entries is moved to its level's free list and the
+ * slot pointing to it is cleared; the walk stops at the first node that
+ * still has an entry.
+ *
+ * NOTE(review): the node returned for the leaf slot is dereferenced
+ * without a NULL test, so @id is presumably required to exist -- confirm.
+ */
+void quotatree_remove(struct quotatree_tree *tree, quotaid_t id)
+{
+	struct quotatree_node *p;
+	int level, i;
+
+	p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH);
+	for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) {
+		for (i = 0; i < QUOTATREE_BSIZE; i++)
+			if (p->blocks[i] != NULL)
+				return;
+		list_move(&p->list, &tree->levels[level].freelh);
+		p = quotatree_remove_ptr(tree, id, level);
+	}
+}
+
+#if 0
+static void quotatree_walk(struct quotatree_tree *tree,
+		struct quotatree_node *node_start,
+		quotaid_t id_start,
+		int level_start, int level_end,
+		int (*callback)(struct quotatree_tree *,
+				quotaid_t id,
+				int level,
+				void *ptr,
+				void *data),
+		void *data)
+{
+	struct quotatree_node *p;
+	int l, shift, index;
+	quotaid_t id;
+	struct quotatree_find_state st;
+
+	p = node_start;
+	l = level_start;
+	shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
+	id = id_start;
+	index = 0;
+
+	/*
+	 * Invariants:
+	 * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
+	 * id & ((1 << shift) - 1) == 0
+	 * p is l-level node corresponding to id
+	 */
+	do {
+		if (!p)
+			break;
+
+		if (l < level_end) {
+			for (; index < QUOTATREE_BSIZE; index++)
+				if (p->blocks[index] != NULL)
+					break;
+			if (index < QUOTATREE_BSIZE) {
+				/* descend */
+				p = p->blocks[index];
+				l++;
+				shift -= QUOTAID_BBITS;
+				id += (quotaid_t)index << shift;
+				index = 0;
+				continue;
+			}
+		}
+
+		if ((*callback)(tree, id, l, p, data))
+			break;
+
+		/* ascend and to the next node */
+		p = quotatree_follow(tree, id, l, &st);
+
+		index = ((id >> shift) & QUOTATREE_BMASK) + 1;
+		l--;
+		shift += QUOTAID_BBITS;
+		id &= ~(((quotaid_t)1 << shift) - 1);
+	} while (l >= level_start);
+}
+#endif
+/* unlink and kfree() every node on @node_list */
+static void free_list(struct list_head *node_list)
+{
+	struct quotatree_node *p, *tmp;
+
+	list_for_each_entry_safe(p, tmp, node_list, list) {
+		list_del(&p->list);
+		kfree(p);
+	}
+}
+/* free the nodes of every level, both the used and the free lists */
+static inline void quotatree_free_nodes(struct quotatree_tree *tree)
+{
+	int i;
+
+	for (i = 0; i < QUOTATREE_DEPTH; i++) {
+		free_list(&tree->levels[i].usedlh);
+		free_list(&tree->levels[i].freelh);
+	}
+}
+/* call @dtor on every non-NULL leaf reachable from the leaf level's used list */
+static void quotatree_free_leafs(struct quotatree_tree *tree,
+		void (*dtor)(void *))
+{
+	int i;
+	struct quotatree_node *p;
+
+	list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
+		for (i = 0; i < QUOTATREE_BSIZE; i++) {
+			if (p->blocks[i] == NULL)
+				continue;
+
+			dtor(p->blocks[i]);
+		}
+	}
+}
+/*
+ * Destroy @tree: leaves are handed to @dtor first (they are reached
+ * through the nodes), then all nodes and finally the tree itself are
+ * freed.
+ */
+void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *))
+{
+	quotatree_free_leafs(tree, dtor);
+	quotatree_free_nodes(tree);
+	kfree(tree);
+}
diff -urNp linux-2.6.32.48/fs/quota/vzdquota/vzdq_ugid.c linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_ugid.c
--- linux-2.6.32.48/fs/quota/vzdquota/vzdq_ugid.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdq_ugid.c	2011-11-21 17:40:45.000000000 -0500
@@ -0,0 +1,1216 @@
+/*
+ * Copyright (C) 2002 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo UID/GID disk quota implementation
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "../quotaio_v2.h"
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+/*
+ * XXX
+ * may be something is needed for sb->s_dquot->info[]?
+ */ + +#define USRQUOTA_MASK (1 << USRQUOTA) +#define GRPQUOTA_MASK (1 << GRPQUOTA) +#define QTYPE2MASK(type) (1 << (type)) + +static struct kmem_cache *vz_quota_ugid_cachep; + +inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid) +{ + if (qugid != VZ_QUOTA_UGBAD) + atomic_inc(&qugid->qugid_count); + return qugid; +} + +/* we don't limit users with zero limits */ +static inline int vzquota_fake_stat(struct dq_stat *stat) +{ + return stat->bhardlimit == 0 && stat->bsoftlimit == 0 && + stat->ihardlimit == 0 && stat->isoftlimit == 0; +} + +/* callback function for quotatree_free() */ +static inline void vzquota_free_qugid(void *ptr) +{ + kmem_cache_free(vz_quota_ugid_cachep, ptr); +} + +/* + * destroy ugid, if it have zero refcount, limits and usage + * must be called under qmblk->dq_mutex + */ +void vzquota_put_ugid(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid) +{ + if (qugid == VZ_QUOTA_UGBAD) + return; + qmblk_data_read_lock(qmblk); + if (atomic_dec_and_test(&qugid->qugid_count) && + (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 && + vzquota_fake_stat(&qugid->qugid_stat) && + qugid->qugid_stat.bcurrent == 0 && + qugid->qugid_stat.icurrent == 0) { + quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type), + qugid->qugid_id); + qmblk->dq_ugid_count--; + vzquota_free_qugid(qugid); + } + qmblk_data_read_unlock(qmblk); +} + +/* + * Get ugid block by its index, like it would present in array. + * In reality, this is not array - this is leafs chain of the tree. + * NULL if index is out of range. + * qmblk semaphore is required to protect the tree. 
+ */
+static inline struct vz_quota_ugid *
+vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type)
+{
+	return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index);
+}
+
+/*
+ * get next element from ugid "virtual array"
+ * ugid must be in current array and this array may not be changed between
+ * two accesses (guaranteed by "stopped" quota state and quota semaphore)
+ * qmblk semaphore is required to protect the tree
+ *
+ * The tree is chosen by the type stored in @qugid and the search starts
+ * after the id stored in @qugid; the result is whatever
+ * quotatree_get_next() returns (NULL when nothing follows).
+ */
+static inline struct vz_quota_ugid *
+vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid)
+{
+	return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type),
+			qugid->qugid_id);
+}
+
+/*
+ * requires dq_mutex
+ *
+ * Find the ugid record (@quota_id, @type) of @qmblk, creating it when it
+ * does not exist yet and @flags allow that:
+ *   VZDQUG_FIND_DONT_ALLOC  never create, a miss is a failure
+ *   VZDQUG_FIND_FAKE        create even if the ugid limit is reached or
+ *                           the master is in VZDQUG_FIXED_SET mode
+ *
+ * On success the record is returned with one reference taken through
+ * vzquota_get_ugid().  Every failure returns VZ_QUOTA_UGBAD, never NULL.
+ */
+struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
+		unsigned int quota_id, int type, int flags)
+{
+	struct vz_quota_ugid *qugid;
+	struct quotatree_tree *tree;
+	struct quotatree_find_state st;
+	/* st is filled by the lookup and reused by quotatree_insert() below */
+	tree = QUGID_TREE(qmblk, type);
+	qugid = quotatree_find(tree, quota_id, &st);
+	if (qugid)
+		goto success;
+
+	/* caller does not want alloc */
+	if (flags & VZDQUG_FIND_DONT_ALLOC)
+		goto fail;
+	/* fake records skip both admission checks below */
+	if (flags & VZDQUG_FIND_FAKE)
+		goto doit;
+
+	/* check limit */
+	if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max)
+		goto fail;
+
+	/* see comment at VZDQUG_FIXED_SET define */
+	if (qmblk->dq_flags & VZDQUG_FIXED_SET)
+		goto fail;
+
+doit:
+	/*
+	 * alloc new structure
+	 * NOTE(review): the allocation passes __GFP_NOFAIL, so the NULL
+	 * test below is presumably never true -- confirm.
+	 */
+	qugid = kmem_cache_alloc(vz_quota_ugid_cachep,
+			GFP_NOFS | __GFP_NOFAIL);
+	if (qugid == NULL)
+		goto fail;
+
+	/* initialize new structure: zero usage and limits, no references yet */
+	qugid->qugid_id = quota_id;
+	memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat));
+	qugid->qugid_type = type;
+	atomic_set(&qugid->qugid_count, 0);
+
+	/* insert in tree */
+	if (quotatree_insert(tree, quota_id, &st, qugid) < 0)
+		goto fail_insert;
+	qmblk->dq_ugid_count++;
+	/* both the found and the freshly inserted record get the caller's reference */
+success:
+	vzquota_get_ugid(qugid);
+	return qugid;
+	/* the new record never made it into the tree: give it back to the cache */
+fail_insert:
+	vzquota_free_qugid(qugid);
+fail:
+	return VZ_QUOTA_UGBAD;
+}
+
+/*
+ * takes dq_mutex, may schedule
+ */
+struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
+		unsigned int quota_id, int type, int flags)
+{
+	struct vz_quota_ugid *qugid;
+
+	mutex_lock(&qmblk->dq_mutex);
+	qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags);
+	mutex_unlock(&qmblk->dq_mutex);
+
+	return qugid;
+}
+
+/*
+ * destroy all ugid records on given quota master
+ *
+ * The uid and gid trees must either both exist or both be NULL
+ * (anything else is a BUG); when they exist each tree is freed together
+ * with its records.
+ */
+void vzquota_kill_ugid(struct vz_quota_master *qmblk)
+{
+	BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) ||
+		(qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL));
+
+	if (qmblk->dq_uid_tree != NULL) {
+		quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid);
+		quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid);
+	}
+}
+
+
+/* ----------------------------------------------------------------------
+ * Management interface to ugid quota for (super)users.
+ *
+ * The *2 callbacks below make up vz_quota_operations2, which
+ * vz_quota_on() installs as sb->dq_op.  They only keep the inode byte
+ * count up to date (alloc_space/free_space) and otherwise answer
+ * QUOTA_OK without doing any accounting.
+ * --------------------------------------------------------------------- */
+
+static int vzquota_initialize2(struct inode *inode, int type)
+{
+	return QUOTA_OK;
+}
+
+static int vzquota_drop2(struct inode *inode)
+{
+	return QUOTA_OK;
+}
+
+static int vzquota_alloc_space2(struct inode *inode,
+		qsize_t number, int prealloc)
+{
+	inode_add_bytes(inode, number);
+	return QUOTA_OK;
+}
+
+static int vzquota_alloc_inode2(const struct inode *inode, qsize_t number)
+{
+	return QUOTA_OK;
+}
+
+static int vzquota_free_space2(struct inode *inode, qsize_t number)
+{
+	inode_sub_bytes(inode, number);
+	return QUOTA_OK;
+}
+
+static int vzquota_free_inode2(const struct inode *inode, qsize_t number)
+{
+	return QUOTA_OK;
+}
+
+static int vzquota_transfer2(struct inode *inode, struct iattr *iattr)
+{
+	return QUOTA_OK;
+}
+
+struct dquot_operations vz_quota_operations2 = {
+	.initialize	= vzquota_initialize2,
+	.drop		= vzquota_drop2,
+	.alloc_space	= vzquota_alloc_space2,
+	.alloc_inode	= vzquota_alloc_inode2,
+	.free_space	= vzquota_free_space2,
+	.free_inode	= vzquota_free_inode2,
+	.transfer	= vzquota_transfer2,
+};
+
+/* syscall entry points used by vz_restore_symlink() below */
+asmlinkage long sys_unlink(const char __user * pathname);
+asmlinkage long sys_rename(const char __user * oldname,
+		const char __user * newname);
+asmlinkage long sys_symlink(const char __user * oldname,
+		const char __user * newname);
+
+/*
+ * called under sb->s_umount semaphore
+ *
+ * Replace @path by a symlink to /proc/vz/vzaquota/<dev>/aquota.user or
+ * aquota.group (selected by @type): the link is created as @path with a
+ * ".new" suffix and then renamed over @path.
+ *
+ * s_umount is released around the unlink/symlink/rename calls and taken
+ * again before returning.  Returns 0 or a negative errno; -ENODEV when
+ * the calls succeeded but the superblock has no root any more.
+ */
+static int vz_restore_symlink(struct super_block *sb, char *path, int type)
+{
+	mm_segment_t oldfs;
+	char *newpath;
+	char dest[64];
+	const char *names[] = {
+		[USRQUOTA] "aquota.user",
+		[GRPQUOTA] "aquota.group"
+	};
+	int err;
+	/* sizeof(".new") includes the terminating NUL, so newpath fits exactly */
+	newpath = kmalloc(strlen(path) + sizeof(".new"), GFP_KERNEL);
+	if (newpath == NULL)
+		return -ENOMEM;
+
+	strcpy(newpath, path);
+	strcat(newpath, ".new");
+
+	sprintf(dest, "/proc/vz/vzaquota/%08x/%s",
+			new_encode_dev(sb->s_dev), names[type]);
+
+	/*
+	 * Lockdep will learn unneeded dependency while unlink(2):
+	 *	->s_umount => ->i_mutex/1 => ->i_mutex
+	 * Reverse dependency is,
+	 *	open_namei() => ->i_mutex => lookup_hash() => __lookup_hash()
+	 *	=> ->lookup() \eq vzdq_aquotq_lookup() => find_qmblk_by_dev()
+	 *	=> user_get_super() => ->s_umount
+	 *
+	 * However, first set of ->i_mutex'es belong to /, second to /proc .
+	 * Right fix is to get rid of vz_restore_symlink(), of course.
+	 */
+	up_read(&sb->s_umount);
+	/* the path strings are kernel memory, so widen the address limit for the calls */
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_unlink(newpath);
+	if (err < 0 && err != -ENOENT)
+		goto out_restore;
+	err = sys_symlink(dest, newpath);
+	if (err < 0)
+		goto out_restore;
+	err = sys_rename(newpath, path);
+out_restore:
+	set_fs(oldfs);
+
+	down_read(&sb->s_umount);
+	/* umounted meanwhile? */
+	if (err == 0 && !sb->s_root)
+		err = -ENODEV;
+
+	kfree(newpath);
+	return err;
+}
+
+/*
+ * called under sb->s_umount semaphore
+ *
+ * quota_on handler: restores the aquota symlink for @type, switches the
+ * superblock to vz_quota_operations2 / vz_quotactl_operations and marks
+ * the type as enabled in the quota master and in sb->s_dquot.flags.
+ *
+ * Returns 0, -ESRCH (no quota master for @sb), -EIO (bad master),
+ * -EBUSY (type already on) or the error of vz_restore_symlink().
+ */
+static int vz_quota_on(struct super_block *sb, int type,
+		int format_id, char *path, int remount)
+{
+	struct vz_quota_master *qmblk;
+	int mask2;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	err = vz_restore_symlink(sb, path, type);
+	if (err < 0)
+		goto out_put;
+
+	mutex_lock(&vz_quota_mutex);
+	mask2 = 0;
+	sb->dq_op = &vz_quota_operations2;
+	sb->s_qcop = &vz_quotactl_operations;
+	if (type == USRQUOTA)
+		mask2 = VZDQ_USRQUOTA;
+	if (type == GRPQUOTA)
+		mask2 = VZDQ_GRPQUOTA;
+
+	err = -EBUSY;
+	if (qmblk->dq_flags & mask2)
+		goto out_sem;
+
+	err = 0;
+	qmblk->dq_flags |= mask2;
+	sb->s_dquot.flags |= dquot_state_flag(
+			DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, type);
+
+out_sem:
+	mutex_unlock(&vz_quota_mutex);
+out_put:
+	qmblk_put(qmblk);
+out:
+	return err;
+}
+/*
+ * quota_off handler: clears the VZDQ_USRQUOTA / VZDQ_GRPQUOTA flag of
+ * @type in the quota master.
+ * Returns 0, -ESRCH (no master), -EIO (bad master) or -EINVAL when the
+ * type was not enabled.
+ */
+static int vz_quota_off(struct super_block *sb, int type, int remount)
+{
+	struct vz_quota_master *qmblk;
+	int mask2;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	mask2 = 0;
+	if (type == USRQUOTA)
+		mask2 = VZDQ_USRQUOTA;
+	if (type == GRPQUOTA)
+		mask2 = VZDQ_GRPQUOTA;
+	err = -EINVAL;
+	if (!(qmblk->dq_flags & mask2))
+		goto out;
+
+	qmblk->dq_flags &= ~mask2;
+	err = 0;
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+
+static int vz_quota_sync(struct super_block *sb, int type)
+{
+	return 0;	/* vz quota is always uptodate */
+}
+/*
+ * get_dqblk handler: report limits and usage of (@type, @id) in @di.
+ *
+ * Block limits are stored in bytes and reported shifted right by 10;
+ * current space is reported as stored.  An id without a record yields
+ * an all-zero @di.  dqb_valid is QIF_ALL in both cases.
+ * Returns 0, -ESRCH (no master) or -EIO (bad master).
+ *
+ * NOTE(review): vzquota_put_ugid() is documented as requiring
+ * qmblk->dq_mutex, which is not held here -- confirm.
+ */
+static int vz_get_dqblk(struct super_block *sb, int type,
+		qid_t id, struct if_dqblk *di)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid *ugid;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	err = 0;
+	ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC);
+	if (ugid != VZ_QUOTA_UGBAD) {
+		qmblk_data_read_lock(qmblk);
+		di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10;
+		di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10;
+		di->dqb_curspace = ugid->qugid_stat.bcurrent;
+		di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit;
+		di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit;
+		di->dqb_curinodes = ugid->qugid_stat.icurrent;
+		di->dqb_btime = ugid->qugid_stat.btime;
+		di->dqb_itime = ugid->qugid_stat.itime;
+		qmblk_data_read_unlock(qmblk);
+		di->dqb_valid = QIF_ALL;
+		vzquota_put_ugid(qmblk, ugid);
+	} else {
+		memset(di, 0, sizeof(*di));
+		di->dqb_valid = QIF_ALL;
+	}
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+
+/*
+ * must be called under vz_quota_mutex
+ *
+ * Apply the limits in @di to the record (@type, @id), creating the
+ * record if necessary.  Only the parts flagged in di->dqb_valid
+ * (QIF_BLIMITS, QIF_ILIMITS) are changed; block limits arrive in 1K
+ * units and are stored shifted left by 10.
+ * @type indexes qmblk->dq_ugid_info[], so it has to be a valid quota
+ * type.  Returns 0, or -ESRCH when no record could be found or created.
+ */
+static int __vz_set_dqblk(struct vz_quota_master *qmblk,
+		int type, qid_t id, struct if_dqblk *di)
+{
+	struct vz_quota_ugid *ugid;
+
+	ugid = vzquota_find_ugid(qmblk, id, type, 0);
+	if (ugid == VZ_QUOTA_UGBAD)
+		return -ESRCH;
+
+	qmblk_data_write_lock(qmblk);
+	/*
+	 * Subtle compatibility breakage.
+	 *
+	 * Some old non-vz kernel quota didn't start grace period
+	 * if the new soft limit happens to be below the usage.
+	 * Non-vz kernel quota in 2.4.20 starts the grace period
+	 * (if it hasn't been started).
+	 * Current non-vz kernel performs even more complicated
+	 * manipulations...
+	 *
+	 * Also, current non-vz kernels have inconsistency related to
+	 * the grace time start.  In regular operations the grace period
+	 * is started if the usage is greater than the soft limit (and,
+	 * strangely, is cancelled if the usage is less).
+	 * However, set_dqblk starts the grace period if the usage is greater
+	 * or equal to the soft limit.
+	 *
+	 * Here we try to mimic the behavior of the current non-vz kernel.
+	 */
+	if (di->dqb_valid & QIF_BLIMITS) {
+		ugid->qugid_stat.bhardlimit =
+			(__u64)di->dqb_bhardlimit << 10;
+		ugid->qugid_stat.bsoftlimit =
+			(__u64)di->dqb_bsoftlimit << 10;
+		if (di->dqb_bsoftlimit == 0 ||
+		    ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit)
+			ugid->qugid_stat.btime = 0;
+		else if (!(di->dqb_valid & QIF_BTIME))
+			ugid->qugid_stat.btime = CURRENT_TIME_SECONDS +
+				qmblk->dq_ugid_info[type].bexpire;
+		else
+			ugid->qugid_stat.btime = di->dqb_btime;
+	}
+	if (di->dqb_valid & QIF_ILIMITS) {
+		ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit;
+		ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit;
+		if (di->dqb_isoftlimit == 0 ||
+		    ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit)
+			ugid->qugid_stat.itime = 0;
+		else if (!(di->dqb_valid & QIF_ITIME))
+			ugid->qugid_stat.itime = CURRENT_TIME_SECONDS +
+				qmblk->dq_ugid_info[type].iexpire;
+		else
+			ugid->qugid_stat.itime = di->dqb_itime;
+	}
+	qmblk_data_write_unlock(qmblk);
+	vzquota_put_ugid(qmblk, ugid);
+
+	return 0;
+}
+/*
+ * set_dqblk handler: locked wrapper around __vz_set_dqblk().
+ * Returns its result, -ESRCH (no master) or -EIO (bad master).
+ */
+static int vz_set_dqblk(struct super_block *sb, int type,
+		qid_t id, struct if_dqblk *di)
+{
+	struct vz_quota_master *qmblk;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+	err = __vz_set_dqblk(qmblk, type, id, di);
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+/*
+ * get_info handler: report the block and inode grace periods of @type;
+ * dqi_flags is always reported as 0 and dqi_valid as IIF_ALL.
+ * Returns 0, -ESRCH (no master) or -EIO (bad master).
+ */
+static int vz_get_dqinfo(struct super_block *sb, int type,
+		struct if_dqinfo *ii)
+{
+	struct vz_quota_master *qmblk;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	err = 0;
+	ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire;
+	ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire;
+	ii->dqi_flags = 0;
+	ii->dqi_valid = IIF_ALL;
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+
+/*
+ * must be called under vz_quota_mutex
+ *
+ * Update the grace periods of @type from @ii, honouring IIF_BGRACE and
+ * IIF_IGRACE in dqi_valid.  A request that carries IIF_FLAGS with any
+ * DQF_MASK bit set is rejected with -EINVAL before anything is changed.
+ * @type indexes qmblk->dq_ugid_info[], so it has to be a valid quota type.
+ */
+static int __vz_set_dqinfo(struct vz_quota_master *qmblk,
+		int type, struct if_dqinfo *ii)
+{
+	if (ii->dqi_valid & IIF_FLAGS)
+		if (ii->dqi_flags & DQF_MASK)
+			return -EINVAL;
+
+	if (ii->dqi_valid & IIF_BGRACE)
+		qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace;
+	if (ii->dqi_valid & IIF_IGRACE)
+		qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace;
+	return 0;
+}
+/*
+ * set_info handler: locked wrapper around __vz_set_dqinfo().
+ * Returns its result, -ESRCH (no master) or -EIO (bad master).
+ */
+static int vz_set_dqinfo(struct super_block *sb, int type,
+		struct if_dqinfo *ii)
+{
+	struct vz_quota_master *qmblk;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	mutex_lock(&vz_quota_mutex);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+	err = __vz_set_dqinfo(qmblk, type, ii);
+out:
+	mutex_unlock(&vz_quota_mutex);
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+	return err;
+}
+
+#ifdef CONFIG_QUOTA_COMPAT
+/* upper bound on the number of records returned by one vz_get_quoti() call */
+#define Q_GETQUOTI_SIZE 1024
+/* copy limits, usage and grace times of ugid record @src into disk-format block @dst */
+#define UGID2DQBLK(dst, src)						\
+	do {								\
+		(dst)->dqb_ihardlimit = (src)->qugid_stat.ihardlimit;	\
+		(dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit;	\
+		(dst)->dqb_curinodes = (src)->qugid_stat.icurrent;	\
+		/* in 1K blocks */					\
+		(dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \
+		/* in 1K blocks */					\
+		(dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \
+		/* in bytes, 64 bit */					\
+		(dst)->dqb_curspace = (src)->qugid_stat.bcurrent;	\
+		(dst)->dqb_btime = (src)->qugid_stat.btime;		\
+		(dst)->dqb_itime = (src)->qugid_stat.itime;		\
+	} while (0)
+/*
+ * get_quoti handler: copy up to Q_GETQUOTI_SIZE records of @type to the
+ * user buffer @dqblk, starting with the record at leaf index @idx.
+ *
+ * The records are first gathered into a vmalloc()ed buffer under
+ * vz_quota_mutex and qmblk->dq_mutex and copied out after both are
+ * released.  Returns the number of records copied, or -ESRCH, -EIO,
+ * -ENOMEM or -EFAULT.
+ */
+static int vz_get_quoti(struct super_block *sb, int type, qid_t idx,
+		struct v2_disk_dqblk __user *dqblk)
+{
+	struct vz_quota_master *qmblk;
+	struct v2_disk_dqblk *data, *kbuf;
+	struct vz_quota_ugid *ugid;
+	int count;
+	int err;
+
+	qmblk = vzquota_find_qmblk(sb);
+	err = -ESRCH;
+	if (qmblk == NULL)
+		goto out;
+	err = -EIO;
+	if (qmblk == VZ_QUOTA_BAD)
+		goto out;
+
+	err = -ENOMEM;
+	kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf));
+	if (!kbuf)
+		goto out;
+
+	mutex_lock(&vz_quota_mutex);
+	mutex_lock(&qmblk->dq_mutex);
+	for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0;
+		ugid != NULL && count < Q_GETQUOTI_SIZE;
+		count++)
+	{
+		data = kbuf + count;
+		qmblk_data_read_lock(qmblk);
+		UGID2DQBLK(data, ugid);
+		qmblk_data_read_unlock(qmblk);
+		data->dqb_id = ugid->qugid_id;
+
+		/* Find next entry */
+		ugid = vzquota_get_next(qmblk, ugid);
+		BUG_ON(ugid != NULL && ugid->qugid_type != type);
+	}
+	mutex_unlock(&qmblk->dq_mutex);
+	mutex_unlock(&vz_quota_mutex);
+	/* only the first count entries of kbuf were filled and only those are copied */
+	err = count;
+	if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf)))
+		err = -EFAULT;
+
+	vfree(kbuf);
+out:
+	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qmblk);
+
+	return err;
+}
+
+#endif
+/* quotactl entry points installed as sb->s_qcop by vz_quota_on() */
+struct quotactl_ops vz_quotactl_operations = {
+	.quota_on	= vz_quota_on,
+	.quota_off	= vz_quota_off,
+	.quota_sync	= vz_quota_sync,
+	.get_info	= vz_get_dqinfo,
+	.set_info	= vz_set_dqinfo,
+	.get_dqblk	= vz_get_dqblk,
+	.set_dqblk	= vz_set_dqblk,
+#ifdef CONFIG_QUOTA_COMPAT
+	.get_quoti	= vz_get_quoti,
+#endif
+};
+
+
+/* ----------------------------------------------------------------------
+ * Management interface for host system admins.
+ * --------------------------------------------------------------------- */
+
+/*
+ * Load up to @ugid_size ugid records from the user array @u_ugid_buf
+ * into the quota master @quota_id.
+ *
+ * Only allowed while the master is in VZDQ_STARTING state.  @compat
+ * selects the compat layout of the user array.
+ *
+ * Returns the number of records processed, which is smaller than
+ * @ugid_size when a copy from user space failed, a record carried a bad
+ * quota type, or no further record could be created (the master is then
+ * switched to VZDQUG_FIXED_SET).  Returns -ENOENT when the master does
+ * not exist and -EBUSY when it is not in VZDQ_STARTING state.
+ */
+static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size,
+		struct vz_quota_iface __user *u_ugid_buf, int compat)
+{
+	struct vz_quota_master *qmblk;
+	int ret;
+
+	mutex_lock(&vz_quota_mutex);
+
+	ret = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	ret = -EBUSY;
+	if (qmblk->dq_state != VZDQ_STARTING)
+		goto out; /* working quota doesn't accept new ugids */
+
+	ret = 0;
+	/* start to add ugids */
+	for (ret = 0; ret < ugid_size; ret++) {
+		struct vz_quota_iface ugid_buf;
+		struct vz_quota_ugid *ugid;
+
+		if (!compat) {
+			if (copy_from_user(&ugid_buf, u_ugid_buf,
+						sizeof(ugid_buf)))
+				break;
+			u_ugid_buf++; /* next user buffer */
+		} else {
+#ifdef CONFIG_COMPAT
+			struct compat_vz_quota_iface oqif;
+			if (copy_from_user(&oqif, u_ugid_buf,
+						sizeof(oqif)))
+				break;
+			ugid_buf.qi_id = oqif.qi_id;
+			ugid_buf.qi_type = oqif.qi_type;
+			compat_dqstat2dqstat(&oqif.qi_stat, &ugid_buf.qi_stat);
+			/* the user array has compat-sized elements: step by that size */
+			u_ugid_buf = (struct vz_quota_iface __user *)
+					(((void *)u_ugid_buf) + sizeof(oqif));
+#endif
+		}
+
+		if (ugid_buf.qi_type >= MAXQUOTAS)
+			break; /* bad quota type - this is the only check */
+
+		ugid = vzquota_find_ugid(qmblk,
+				ugid_buf.qi_id, ugid_buf.qi_type, 0);
+		if (ugid == VZ_QUOTA_UGBAD) {
+			qmblk->dq_flags |= VZDQUG_FIXED_SET;
+			break; /* limit reached */
+		}
+
+		/* update usage/limits
+		 * we can copy the data without the lock, because the data
+		 * cannot be modified in VZDQ_STARTING state */
+		ugid->qugid_stat = ugid_buf.qi_stat;
+
+		vzquota_put_ugid(qmblk, ugid);
+	}
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return ret;
+}
+
+/*
+ * Set the block and inode grace periods of every quota type of master
+ * @quota_id from the MAXQUOTAS-element user array @u_dq_info.
+ *
+ * Only allowed while the master is in VZDQ_STARTING state; the flags
+ * member of the user data is not applied.
+ * Returns 0, -ENOENT, -EBUSY or -EFAULT.
+ */
+static int quota_ugid_setgrace(unsigned int quota_id,
+		struct dq_info __user u_dq_info[], int compat)
+{
+	struct vz_quota_master *qmblk;
+	struct dq_info dq_info[MAXQUOTAS];
+	struct dq_info *target;
+	int err, type;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EBUSY;
+	if (qmblk->dq_state != VZDQ_STARTING)
+		goto out; /* working quota doesn't accept changing options */
+
+	err = -EFAULT;
+	if (!compat) {
+		if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info)))
+			goto out;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_dq_info odqi[MAXQUOTAS];
+		if (copy_from_user(odqi, u_dq_info, sizeof(odqi)))
+			goto out;
+		for (type = 0; type < MAXQUOTAS; type++)
+			compat_dqinfo2dqinfo(&odqi[type], &dq_info[type]);
+#endif
+	}
+
+	err = 0;
+
+	/* update in qmblk */
+	for (type = 0; type < MAXQUOTAS; type++) {
+		target = &qmblk->dq_ugid_info[type];
+		target->bexpire = dq_info[type].bexpire;
+		target->iexpire = dq_info[type].iexpire;
+	}
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+/*
+ * Fill @u_ugid_buf with up to @size records, starting at position @index
+ * of the "virtual array" formed by all user records followed by all
+ * group records.
+ *
+ * Despite its name @u_ugid_buf is written with memcpy(): the caller
+ * passes a kernel buffer and copies it to user space afterwards.
+ * Returns the number of records stored (0 when @index is past the end).
+ */
+static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size,
+		struct vz_quota_iface *u_ugid_buf)
+{
+	int type, count;
+	struct vz_quota_ugid *ugid;
+
+	if (QTREE_LEAFNUM(qmblk->dq_uid_tree) +
+			QTREE_LEAFNUM(qmblk->dq_gid_tree)
+				<= index)
+		return 0;
+
+	count = 0;
+
+	/* translate the global index into a tree and an index inside that tree */
+	type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? USRQUOTA : GRPQUOTA;
+	if (type == GRPQUOTA)
+		index -= QTREE_LEAFNUM(qmblk->dq_uid_tree);
+
+	/* loop through ugid and then qgid quota */
+repeat:
+	for (ugid = vzquota_get_byindex(qmblk, index, type);
+		ugid != NULL && count < size;
+		ugid = vzquota_get_next(qmblk, ugid), count++)
+	{
+		struct vz_quota_iface ugid_buf;
+
+		/*
+		 * The whole structure ends up in user space, so start from
+		 * zeroes: bytes not covered by the assignments below (struct
+		 * padding, if the layout has any) must not carry old stack
+		 * contents.
+		 */
+		memset(&ugid_buf, 0, sizeof(ugid_buf));
+
+		/* form interface buffer and send in to user-level */
+		qmblk_data_read_lock(qmblk);
+		memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat,
+				sizeof(ugid_buf.qi_stat));
+		qmblk_data_read_unlock(qmblk);
+		ugid_buf.qi_id = ugid->qugid_id;
+		ugid_buf.qi_type = ugid->qugid_type;
+
+		memcpy(u_ugid_buf, &ugid_buf, sizeof(ugid_buf));
+		u_ugid_buf++; /* next portion of user buffer */
+	}
+
+	/* user records exhausted with room left: continue with the group tree */
+	if (type == USRQUOTA && count < size) {
+		type = GRPQUOTA;
+		index = 0;
+		goto repeat;
+	}
+
+	return count;
+}
+
+/*
+ * Copy up to @size ugid records of master @quota_id, starting at
+ * position @index, to the user array @u_ugid_buf.
+ *
+ * The records are gathered into a vmalloc()ed kernel buffer under
+ * qmblk->dq_mutex and then copied out, element by element in the compat
+ * layout when @compat is set.
+ * Returns the number of records copied, or -EINVAL (negative or too
+ * large arguments), -ENOMEM, -ENOENT or -EFAULT.
+ *
+ * NOTE(review): a @size of 0 reaches vmalloc() with a zero length and
+ * therefore presumably fails with -ENOMEM -- confirm that is intended.
+ */
+static int quota_ugid_getstat(unsigned int quota_id,
+		int index, int size, struct vz_quota_iface __user *u_ugid_buf,
+		int compat)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_iface *k_ugid_buf;
+	int err;
+
+	if (index < 0 || size < 0)
+		return -EINVAL;
+
+	/* keep size * sizeof(element) representable */
+	if (size > INT_MAX / sizeof(struct vz_quota_iface))
+		return -EINVAL;
+
+	k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface));
+	if (k_ugid_buf == NULL)
+		return -ENOMEM;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	mutex_lock(&qmblk->dq_mutex);
+	err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf);
+	mutex_unlock(&qmblk->dq_mutex);
+	if (err < 0)
+		goto out;
+
+	/* from here on err is the number of valid entries in k_ugid_buf */
+	if (!compat) {
+		if (copy_to_user(u_ugid_buf, k_ugid_buf,
+					err * sizeof(struct vz_quota_iface)))
+			err = -EFAULT;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_vz_quota_iface oqif;
+		int i;
+		for (i = 0; i < err; i++) {
+			oqif.qi_id = k_ugid_buf[i].qi_id;
+			oqif.qi_type = k_ugid_buf[i].qi_type;
+			dqstat2compat_dqstat(&k_ugid_buf[i].qi_stat,
+					&oqif.qi_stat);
+			if (copy_to_user(u_ugid_buf, &oqif, sizeof(oqif))) {
+				/*
+				 * Stop explicitly: err doubles as the loop
+				 * bound, so do not rely on it having turned
+				 * negative to end the loop.
+				 */
+				err = -EFAULT;
+				break;
+			}
+			u_ugid_buf = (struct vz_quota_iface __user *)
+					(((void *)u_ugid_buf) + sizeof(oqif));
+		}
+#endif
+	}
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+	vfree(k_ugid_buf);
+	return err;
+}
+
+/*
+ * Report the grace periods and flags of every quota type of master
+ * @quota_id in the MAXQUOTAS-element user array @u_dq_info.
+ * Returns 0, -ENOENT or -EFAULT.
+ */
+static int quota_ugid_getgrace(unsigned int quota_id,
+		struct dq_info __user u_dq_info[], int compat)
+{
+	struct vz_quota_master *qmblk;
+	struct dq_info dq_info[MAXQUOTAS];
+	struct dq_info *target;
+	int err, type;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = 0;
+	/*
+	 * The array is copied to user space as a whole; zero it so that
+	 * bytes the member assignments do not reach (struct padding, if the
+	 * layout has any) do not carry old stack contents.
+	 */
+	memset(dq_info, 0, sizeof(dq_info));
+	/* update from qmblk */
+	for (type = 0; type < MAXQUOTAS; type ++) {
+		target = &qmblk->dq_ugid_info[type];
+		dq_info[type].bexpire = target->bexpire;
+		dq_info[type].iexpire = target->iexpire;
+		dq_info[type].flags = target->flags;
+	}
+
+	if (!compat) {
+		if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info)))
+			err = -EFAULT;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_dq_info odqi[MAXQUOTAS];
+		for (type = 0; type < MAXQUOTAS; type ++)
+			dqinfo2compat_dqinfo(&dq_info[type], &odqi[type]);
+		if (copy_to_user(u_dq_info, odqi, sizeof(odqi)))
+			err = -EFAULT;
+#endif
+	}
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+/*
+ * Report the ugid limit, the current ugid count and the flags of master
+ * @quota_id in @info.  Returns 0, -ENOENT or -EFAULT.
+ */
+static int quota_ugid_getconfig(unsigned int quota_id,
+		struct vz_quota_ugid_stat __user *info)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid_stat kinfo;
+	int err;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = 0;
+	/* copied out as a whole below: do not leave any byte uninitialized */
+	memset(&kinfo, 0, sizeof(kinfo));
+	kinfo.limit = qmblk->dq_ugid_max;
+	kinfo.count = qmblk->dq_ugid_count;
+	kinfo.flags = qmblk->dq_flags;
+
+	if (copy_to_user(info, &kinfo, sizeof(kinfo)))
+		err = -EFAULT;
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+/*
+ * Set the ugid limit of master @quota_id from @info.  The flags are
+ * taken over only while the master is in VZDQ_STARTING state; when they
+ * contain VZDQUG_ON both user and group quota are switched on.
+ * Returns 0, -ENOENT or -EFAULT.
+ */
+static int quota_ugid_setconfig(unsigned int quota_id,
+		struct vz_quota_ugid_stat __user *info)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid_stat kinfo;
+	int err;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EFAULT;
+	if (copy_from_user(&kinfo, info, sizeof(kinfo)))
+		goto out;
+
+	err = 0;
+	qmblk->dq_ugid_max = kinfo.limit;
+	if (qmblk->dq_state == VZDQ_STARTING) {
+		qmblk->dq_flags = kinfo.flags;
+		if (qmblk->dq_flags & VZDQUG_ON)
+			qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA;
+	}
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+/*
+ * Set the limits of one ugid record of master @quota_id from the user
+ * structure @u_lim (quota type, id and an if_dqblk).
+ * Returns the result of __vz_set_dqblk(), or -ESRCH (no such master),
+ * -EFAULT (bad user pointer) or -EINVAL (invalid quota type).
+ */
+static int quota_ugid_setlimit(unsigned int quota_id,
+		struct vz_quota_ugid_setlimit __user *u_lim)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid_setlimit lim;
+	int err;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ESRCH;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EFAULT;
+	if (copy_from_user(&lim, u_lim, sizeof(lim)))
+		goto out;
+
+	/*
+	 * lim.type comes straight from user space and __vz_set_dqblk() uses
+	 * it to pick the ugid tree and to index qmblk->dq_ugid_info[], which
+	 * holds MAXQUOTAS entries.  Refuse anything outside that range; the
+	 * cast makes a negative value fail the test as well.
+	 */
+	err = -EINVAL;
+	if ((unsigned int)lim.type >= MAXQUOTAS)
+		goto out;
+
+	err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb);
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+/*
+ * Set the grace periods of one quota type of master @quota_id from the
+ * user structure @u_info (quota type and an if_dqinfo).
+ * Returns the result of __vz_set_dqinfo(), or -ESRCH (no such master),
+ * -EFAULT (bad user pointer) or -EINVAL (invalid quota type).
+ */
+static int quota_ugid_setinfo(unsigned int quota_id,
+		struct vz_quota_ugid_setinfo __user *u_info)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid_setinfo info;
+	int err;
+
+	mutex_lock(&vz_quota_mutex);
+
+	err = -ESRCH;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = -EFAULT;
+	if (copy_from_user(&info, u_info, sizeof(info)))
+		goto out;
+
+	/*
+	 * info.type comes straight from user space and __vz_set_dqinfo()
+	 * writes qmblk->dq_ugid_info[type], which holds MAXQUOTAS entries.
+	 * Refuse anything outside that range; the cast makes a negative
+	 * value fail the test as well.
+	 */
+	err = -EINVAL;
+	if ((unsigned int)info.type >= MAXQUOTAS)
+		goto out;
+
+	err = __vz_set_dqinfo(qmblk, info.type, &info.dqi);
+
+out:
+	mutex_unlock(&vz_quota_mutex);
+
+	return err;
+}
+
+/*
+ * This is a system call to maintain UGID quotas
+ * Note this call is allowed to run ONLY from VE0
+ */
+long do_vzquotaugidctl(int cmd, unsigned int quota_id,
+		unsigned int ugid_index, unsigned int ugid_size,
+		void *addr, int compat)
+{
+	int ret;
+
+	ret = -EPERM;
+	/* access allowed only from root of VE0 */
+	if (!capable(CAP_SYS_RESOURCE) ||
+			!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	switch (cmd) {
+		case VZ_DQ_UGID_GETSTAT:
+			ret = quota_ugid_getstat(quota_id,
+					ugid_index, ugid_size,
+					(struct vz_quota_iface __user *)addr,
+					compat);
+ break; + case VZ_DQ_UGID_ADDSTAT: + ret = quota_ugid_addstat(quota_id, ugid_size, + (struct vz_quota_iface __user *) addr, + compat); + break; + case VZ_DQ_UGID_GETGRACE: + ret = quota_ugid_getgrace(quota_id, + (struct dq_info __user *)addr, compat); + break; + case VZ_DQ_UGID_SETGRACE: + ret = quota_ugid_setgrace(quota_id, + (struct dq_info __user *)addr, compat); + break; + case VZ_DQ_UGID_GETCONFIG: + ret = quota_ugid_getconfig(quota_id, + (struct vz_quota_ugid_stat __user *) + addr); + break; + case VZ_DQ_UGID_SETCONFIG: + ret = quota_ugid_setconfig(quota_id, + (struct vz_quota_ugid_stat __user *) + addr); + break; + case VZ_DQ_UGID_SETLIMIT: + ret = quota_ugid_setlimit(quota_id, + (struct vz_quota_ugid_setlimit __user *) + addr); + break; + case VZ_DQ_UGID_SETINFO: + ret = quota_ugid_setinfo(quota_id, + (struct vz_quota_ugid_setinfo __user *) + addr); + break; + default: + ret = -EINVAL; + goto out; + } +out: + return ret; +} + +static void ugid_quota_on_sb(struct super_block *sb) +{ + struct super_block *real_sb; + struct vz_quota_master *qmblk; + + if (!sb->s_op->get_quota_root) + return; + + real_sb = sb->s_op->get_quota_root(sb)->i_sb; + if (real_sb->dq_op != &vz_quota_operations) + return; + + sb->dq_op = &vz_quota_operations2; + sb->s_qcop = &vz_quotactl_operations; + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; + sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; + + qmblk = vzquota_find_qmblk(sb); + if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD)) + return; + mutex_lock(&vz_quota_mutex); + if (qmblk->dq_flags & VZDQ_USRQUOTA) + sb->s_dquot.flags |= dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, USRQUOTA); + if (qmblk->dq_flags & VZDQ_GRPQUOTA) + sb->s_dquot.flags |= dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, GRPQUOTA); + mutex_unlock(&vz_quota_mutex); + 
qmblk_put(qmblk); +} + +static void ugid_quota_off_sb(struct super_block *sb) +{ + /* can't make quota off on mounted super block */ + BUG_ON(sb->s_root != NULL); +} + +static int ugid_notifier_call(struct vnotifier_block *self, + unsigned long n, void *data, int old_ret) +{ + struct virt_info_quota *viq; + + viq = (struct virt_info_quota *)data; + + switch (n) { + case VIRTINFO_QUOTA_ON: + ugid_quota_on_sb(viq->super); + break; + case VIRTINFO_QUOTA_OFF: + ugid_quota_off_sb(viq->super); + break; + case VIRTINFO_QUOTA_GETSTAT: + break; + default: + return old_ret; + } + return NOTIFY_OK; +} + +static struct vnotifier_block ugid_notifier_block = { + .notifier_call = ugid_notifier_call, +}; + +/* ---------------------------------------------------------------------- + * Init/exit. + * --------------------------------------------------------------------- */ + +int vzquota_ugid_init(void) +{ + int err; + + vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid", + sizeof(struct vz_quota_ugid), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (vz_quota_ugid_cachep == NULL) + goto err_slab; + + err = register_quota_format(&vz_quota_empty_v2_format); + if (err) + goto err_reg; + + virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block); + return 0; + +err_reg: + kmem_cache_destroy(vz_quota_ugid_cachep); + return err; + +err_slab: + printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); + return -ENOMEM; +} + +void vzquota_ugid_release(void) +{ + virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block); + unregister_quota_format(&vz_quota_empty_v2_format); + + kmem_cache_destroy(vz_quota_ugid_cachep); +} diff -urNp linux-2.6.32.48/fs/quota/vzdquota/vzdquot.c linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdquot.c --- linux-2.6.32.48/fs/quota/vzdquota/vzdquot.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/quota/vzdquota/vzdquot.c 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,1994 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights 
reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains the core of Virtuozzo disk quota implementation: + * maintenance of VZDQ information in inodes, + * external interfaces, + * module entry. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ---------------------------------------------------------------------- + * + * Locking + * + * ---------------------------------------------------------------------- */ + +/* + * Serializes on/off and all other do_vzquotactl operations. + * Protects qmblk hash. + */ +struct mutex vz_quota_mutex; + +/* + * Data access locks + * inode_qmblk + * protects qmblk pointers in all inodes and qlnk content in general + * (but not qmblk content); + * also protects related qmblk invalidation procedures; + * can't be per-inode because of vzquota_dtree_qmblk complications + * and problems with serialization with quota_on, + * but can be per-superblock; + * qmblk_data + * protects qmblk fields (such as current usage) + * quota_data + * protects charge/uncharge operations, thus, implies + * qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock + * (to protect ugid pointers). 
+ *
+ * Lock order:
+ *	inode_qmblk_lock -> dcache_lock
+ *	inode_qmblk_lock -> qmblk_data
+ */
+static DEFINE_SPINLOCK(vzdq_qmblk_lock);
+/*
+ * inode_qmblk lock helpers.  Both take/release the one file-wide
+ * vzdq_qmblk_lock spinlock; the @sb argument is accepted but not used.
+ */
+inline void inode_qmblk_lock(struct super_block *sb)
+{
+	spin_lock(&vzdq_qmblk_lock);
+}
+
+inline void inode_qmblk_unlock(struct super_block *sb)
+{
+	spin_unlock(&vzdq_qmblk_lock);
+}
+/*
+ * qmblk_data lock helpers.  The read and write variants are implemented
+ * identically: each takes/releases the qmblk->dq_data_lock spinlock.
+ */
+inline void qmblk_data_read_lock(struct vz_quota_master *qmblk)
+{
+	spin_lock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk)
+{
+	spin_unlock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_write_lock(struct vz_quota_master *qmblk)
+{
+	spin_lock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk)
+{
+	spin_unlock(&qmblk->dq_data_lock);
+}
+/* Quota format descriptor that carries no operations (qf_ops is NULL). */
+struct quota_format_type vz_quota_empty_v2_format = {
+	.qf_fmt_id	= QFMT_VFS_V0,
+	.qf_ops		= NULL,
+	.qf_owner	= THIS_MODULE,
+};
+
+/* ----------------------------------------------------------------------
+ *
+ * Master hash table handling.
+ *
+ * SMP not safe, serialized by vz_quota_mutex within quota syscalls
+ *
+ * --------------------------------------------------------------------- */
+
+static struct kmem_cache *vzquota_cachep;
+
+/*
+ * Hash function.
+ */
+#define QHASH_BITS		6
+#define VZ_QUOTA_HASH_SIZE	(1 << QHASH_BITS)
+#define QHASH_MASK		(VZ_QUOTA_HASH_SIZE - 1)
+
+struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE];
+int vzquota_hash_size = VZ_QUOTA_HASH_SIZE;
+/*
+ * Map a quota id to a bucket index of vzquota_hash_table: the id is XOR-ed
+ * with itself shifted right by QHASH_BITS and masked with QHASH_MASK, so
+ * the result is always in [0, VZ_QUOTA_HASH_SIZE).
+ */
+static inline int vzquota_hash_func(unsigned int qid)
+{
+	return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK);
+}
+
+/**
+ * vzquota_alloc_master - alloc and instantiate master quota record
+ * @quota_id: id of the new record; must not be present in the hash yet
+ * @qstat: source of the initial dq_stat and dq_info values
+ *
+ * The new record starts in VZDQ_STARTING state with dq_count set to 1 and
+ * is inserted into vzquota_hash_table.  The duplicate check is done with
+ * vzquota_find_master(), which is documented as called under vz_quota_mutex.
+ *
+ * Returns:
+ *	pointer to newly created record if SUCCESS
+ *	ERR_PTR(-ENOMEM) if out of memory
+ *	ERR_PTR(-EEXIST) if record with given quota_id already exist
+ */
+struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id,
+		struct vz_quota_stat *qstat)
+{
+	int err;
+	struct vz_quota_master *qmblk;
+
+	err = -EEXIST;
+	if (vzquota_find_master(quota_id) != NULL)
+		goto out;
+
+	err = -ENOMEM;
+	qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL);
+	if (qmblk == NULL)
+		goto out;
+#ifdef CONFIG_VZ_QUOTA_UGID
+	qmblk->dq_uid_tree = quotatree_alloc();
+	if (!qmblk->dq_uid_tree)
+		goto out_free;
+
+	qmblk->dq_gid_tree = quotatree_alloc();
+	if (!qmblk->dq_gid_tree)
+		goto out_free_tree;
+#endif
+
+	qmblk->dq_state = VZDQ_STARTING;
+	mutex_init(&qmblk->dq_mutex);
+	spin_lock_init(&qmblk->dq_data_lock);
+
+	qmblk->dq_id = quota_id;
+	qmblk->dq_stat = qstat->dq_stat;
+	qmblk->dq_info = qstat->dq_info;
+	qmblk->dq_root_path.dentry = NULL;
+	qmblk->dq_root_path.mnt = NULL;
+	qmblk->dq_sb = NULL;
+	qmblk->dq_ugid_count = 0;
+	qmblk->dq_ugid_max = 0;
+	qmblk->dq_flags = 0;
+	memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info));
+	INIT_LIST_HEAD(&qmblk->dq_ilink_list);
+
+	atomic_set(&qmblk->dq_count, 1);
+
+	/* insert in hash chain */
+	list_add(&qmblk->dq_hash,
+		&vzquota_hash_table[vzquota_hash_func(quota_id)]);
+
+	/* success */
+	return qmblk;
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+out_free_tree:
+	quotatree_free(qmblk->dq_uid_tree, NULL);
+out_free:
+	kmem_cache_free(vzquota_cachep, qmblk);
+#endif
+out:
+	return ERR_PTR(err);
+}
+/*
+ * Allocate a zero-filled placeholder qmblk in VZDQ_STOPING state with the
+ * VZDQ_NOQUOT flag set and dq_count of 1.  It is not inserted into the hash
+ * table.  Returns NULL if the allocation fails.
+ */
+static struct vz_quota_master *vzquota_alloc_fake(void)
+{
+	struct vz_quota_master *qmblk;
+
+	qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL);
+	if (qmblk == NULL)
+		return NULL;
+	memset(qmblk, 0, sizeof(*qmblk));
+	qmblk->dq_state = VZDQ_STOPING;
+	qmblk->dq_flags = VZDQ_NOQUOT;
+	spin_lock_init(&qmblk->dq_data_lock);
+	INIT_LIST_HEAD(&qmblk->dq_ilink_list);
+	atomic_set(&qmblk->dq_count, 1);
+	return qmblk;
+}
+
+/**
+ * vzquota_find_master - find master record with given id
+ * @quota_id: id to look up in vzquota_hash_table
+ *
+ * Returns qmblk without touching its refcounter, or NULL if no record in
+ * the corresponding hash chain has this id.
+ * Called under vz_quota_mutex.
+ */
+struct vz_quota_master *vzquota_find_master(unsigned int quota_id)
+{
+	int i;
+	struct vz_quota_master *qp;
+
+	i = vzquota_hash_func(quota_id);
+	list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) {
+		if (qp->dq_id == quota_id)
+			return qp;
+	}
+	return NULL;
+}
+
+/**
+ * vzquota_free_master - release resources taken by qmblk, freeing memory
+ * @qmblk: record to free; its dq_ilink_list must be empty (BUG_ON otherwise)
+ *
+ * qmblk is assumed to be already taken out from the hash.
+ * Should be called outside vz_quota_mutex.
+ */
+void vzquota_free_master(struct vz_quota_master *qmblk)
+{
+#ifdef CONFIG_VZ_QUOTA_UGID
+	vzquota_kill_ugid(qmblk);
+#endif
+	BUG_ON(!list_empty(&qmblk->dq_ilink_list));
+	kmem_cache_free(vzquota_cachep, qmblk);
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Passing quota information through current
+ *
+ * Used in inode -> qmblk lookup at inode creation stage (since at that
+ * time there are no links between the inode being created and its parent
+ * directory).
+ *
+ * --------------------------------------------------------------------- */
+
+#define VZDQ_CUR_MAGIC	0x57d0fee2
+/* Returns non-zero when current->magic equals VZDQ_CUR_MAGIC. */
+static inline int vzquota_cur_qmblk_check(void)
+{
+	return current->magic == VZDQ_CUR_MAGIC;
+}
+/* Returns the inode pointer stored in current->ino (no magic check here). */
+static inline struct inode *vzquota_cur_qmblk_fetch(void)
+{
+	return current->ino;
+}
+/* Stores @data in current->ino and sets current->magic to VZDQ_CUR_MAGIC. */
+static inline void vzquota_cur_qmblk_set(struct inode *data)
+{
+	struct task_struct *tsk;
+
+	tsk = current;
+	tsk->magic = VZDQ_CUR_MAGIC;
+	tsk->ino = data;
+}
+
+#if 0
+static inline void vzquota_cur_qmblk_reset(void)
+{
+	current->magic = 0;
+}
+#endif
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Superblock quota operations
+ *
+ * --------------------------------------------------------------------- */
+
+/*
+ * Kernel structure abuse.
+ * We use files[0] pointer as an int variable:
+ * reference counter of how many quota blocks uses this superblock.
+ * files[1] is used for generations structure which helps us to track
+ * when traversing of dentries is really required.
+ *
+ * NOTE(review): the macros below no longer touch files[]; they use
+ * s_dquot.vzdq_master, s_dquot.vzdq_count and reinterpret the storage of
+ * s_dquot.dqio_mutex as a struct timeval.  The text above looks stale --
+ * confirm and update.
+ */
+#define __VZ_QUOTA_NOQUOTA(sb)		sb->s_dquot.vzdq_master
+#define __VZ_QUOTA_TSTAMP(sb)		((struct timeval *)\
+						&sb->s_dquot.dqio_mutex)
+
+#if defined(VZ_QUOTA_UNLOAD)
+
+#define __VZ_QUOTA_SBREF(sb)		sb->s_dquot.vzdq_count
+
+struct dquot_operations *orig_dq_op;
+struct quotactl_ops *orig_dq_cop;
+
+/**
+ * vzquota_get_super - account for a new quoted tree under the superblock
+ *
+ * One superblock can have multiple directory subtrees with different VZ
+ * quotas. We keep a counter of such subtrees and set VZ quota operations or
+ * reset the default ones.
+ *
+ * Called under vz_quota_mutex (from quota_on).
+ */
+int vzquota_get_super(struct super_block *sb)
+{
+	if (sb->dq_op != &vz_quota_operations) {	/* VZ quota ops not installed on this sb yet */
+		down(&sb->s_dquot.dqonoff_sem);
+		if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) {
+			up(&sb->s_dquot.dqonoff_sem);
+			return -EEXIST;		/* USR/GRP quota flags are already set on this sb */
+		}
+		if (orig_dq_op == NULL && sb->dq_op != NULL)
+			orig_dq_op = sb->dq_op;	/* remember the first non-NULL original dq_op */
+		sb->dq_op = &vz_quota_operations;
+		if (orig_dq_cop == NULL && sb->s_qcop != NULL)
+			orig_dq_cop = sb->s_qcop;	/* likewise for the original s_qcop */
+		/* XXX this may race with sys_quotactl */
+#ifdef CONFIG_VZ_QUOTA_UGID
+		sb->s_qcop = &vz_quotactl_operations;
+#else
+		sb->s_qcop = NULL;
+#endif
+		do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
+		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+
+		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+		sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
+		sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
+		/*
+		 * To get quotaops.h call us we need to mark superblock
+		 * as having quota.  These flags mark the moment when
+		 * our dq_op start to be called.
+		 *
+		 * The ordering of dq_op and s_dquot.flags assignment
+		 * needs to be enforced, but other CPUs do not do rmb()
+		 * between s_dquot.flags and dq_op accesses.
+		 */
+		wmb(); synchronize_sched();
+		sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
+		__module_get(THIS_MODULE);
+		up(&sb->s_dquot.dqonoff_sem);
+	}
+	/* per-superblock subtree counter; protected by vz_quota_mutex */
+	__VZ_QUOTA_SBREF(sb)++;
+	return 0;
+}
+
+/**
+ * vzquota_put_super - release superblock when one quota tree goes away
+ *
+ * Called under vz_quota_mutex.
+ */
+void vzquota_put_super(struct super_block *sb)
+{
+	int count;
+	/* drop one subtree reference; the teardown below runs only for the last one */
+	count = --__VZ_QUOTA_SBREF(sb);
+	if (count == 0) {
+		down(&sb->s_dquot.dqonoff_sem);
+		sb->s_dquot.flags = 0;
+		wmb(); synchronize_sched();
+		sema_init(&sb->s_dquot.dqio_sem, 1);
+		sb->s_qcop = orig_dq_cop;
+		sb->dq_op = orig_dq_op;
+		inode_qmblk_lock(sb);
+		quota_gen_put(SB_QGEN(sb));
+		SB_QGEN(sb) = NULL;
+		/* release qlnk's without qmblk */
+		remove_inode_quota_links_list(&non_vzquota_inodes_lh,
+				sb, NULL);
+		/*
+		 * Races with quota initialization:
+		 * after this inode_qmblk_unlock all inode's generations are
+		 * invalidated, quota_inode_qmblk checks superblock operations.
+		 */
+		inode_qmblk_unlock(sb);
+		/*
+		 * Module refcounting: in theory, this is the best place
+		 * to call module_put(THIS_MODULE).
+		 * In reality, it can't be done because we can't be sure that
+		 * other CPUs do not enter our code segment through dq_op
+		 * cached long time ago.  Quotaops interface isn't supposed to
+		 * go into modules currently (that is, into unloadable
+		 * modules).  By omitting module_put, our module isn't
+		 * unloadable.
+		 */
+		up(&sb->s_dquot.dqonoff_sem);
+	}
+}
+
+#else
+
+/**
+ * vzquota_shutdown_super - callback on umount
+ *
+ * Clears the superblock's __VZ_QUOTA_NOQUOTA pointer and, if it was set,
+ * drops the qmblk reference that was held through it.
+ */
+void vzquota_shutdown_super(struct super_block *sb)
+{
+	struct vz_quota_master *qmblk;
+
+	qmblk = __VZ_QUOTA_NOQUOTA(sb);
+	__VZ_QUOTA_NOQUOTA(sb) = NULL;
+	if (qmblk != NULL)
+		qmblk_put(qmblk);
+}
+
+/**
+ * vzquota_get_super - account for a new quoted tree under the superblock
+ *
+ * One superblock can have multiple directory subtrees with different VZ
+ * quotas.
+ *
+ * Makes sure the superblock has a placeholder (NOQUOTA) qmblk and, if VZ
+ * quota operations are not installed yet, installs them and sets the
+ * usage/limits-enabled state flags for USRQUOTA and GRPQUOTA.
+ *
+ * Returns:
+ *	0 on success
+ *	-EEXIST if some quota is loaded on @sb and its dq_op is not
+ *		vz_quota_operations
+ *	-ENOMEM if the placeholder qmblk cannot be allocated
+ *
+ * Called under vz_quota_mutex (from vzquota_on).
+ */
+int vzquota_get_super(struct super_block *sb)
+{
+	struct vz_quota_master *qnew;
+	int err;
+
+	mutex_lock(&sb->s_dquot.dqonoff_mutex);
+	err = -EEXIST;
+	if (sb_any_quota_loaded(sb) && sb->dq_op != &vz_quota_operations)
+		goto out_up;
+
+	/*
+	 * This allocation code should be under sb->dq_op check below, but
+	 * it doesn't really matter...
+	 */
+	if (__VZ_QUOTA_NOQUOTA(sb) == NULL) {
+		qnew = vzquota_alloc_fake();
+		if (qnew == NULL) {
+			/*
+			 * err still holds -EEXIST from the check above;
+			 * report the allocation failure as what it is.
+			 */
+			err = -ENOMEM;
+			goto out_up;
+		}
+		__VZ_QUOTA_NOQUOTA(sb) = qnew;
+	}
+
+	if (sb->dq_op != &vz_quota_operations) {
+		sb->dq_op = &vz_quota_operations;
+#ifdef CONFIG_VZ_QUOTA_UGID
+		sb->s_qcop = &vz_quotactl_operations;
+#else
+		sb->s_qcop = NULL;
+#endif
+		do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
+
+		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+		/* these 2 list heads are checked in sync_dquots() */
+		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+		sb->s_dquot.info[USRQUOTA].dqi_format =
+			&vz_quota_empty_v2_format;
+		sb->s_dquot.info[GRPQUOTA].dqi_format =
+			&vz_quota_empty_v2_format;
+
+		/*
+		 * To get quotaops.h to call us we need to mark superblock
+		 * as having quota.  These flags mark the moment when
+		 * our dq_op start to be called.
+		 *
+		 * The ordering of dq_op and s_dquot.flags assignment
+		 * needs to be enforced, but other CPUs do not do rmb()
+		 * between s_dquot.flags and dq_op accesses.
+		 */
+		wmb(); synchronize_sched();
+		sb->s_dquot.flags =
+			dquot_state_flag(DQUOT_USAGE_ENABLED |
+					 DQUOT_LIMITS_ENABLED,
+					 USRQUOTA) |
+			dquot_state_flag(DQUOT_USAGE_ENABLED |
+					 DQUOT_LIMITS_ENABLED,
+					 GRPQUOTA);
+	}
+	err = 0;
+
+out_up:
+	mutex_unlock(&sb->s_dquot.dqonoff_mutex);
+	return err;
+}
+
+/**
+ * vzquota_put_super - one quota tree less on this superblock
+ *
+ * Called under vz_quota_mutex.
+ */
+void vzquota_put_super(struct super_block *sb)
+{
+	/*
+	 * Even if this put is the last one,
+	 * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop
+	 * won't be called and the remaining qmblk references won't be put.
+	 */
+}
+
+#endif
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Helpers for inode -> qmblk link maintenance
+ *
+ * --------------------------------------------------------------------- */
+
+#define __VZ_QUOTA_EMPTY		((void *)0xbdbdbdbd)
+#define VZ_QUOTA_IS_NOQUOTA(qm, sb)	((qm)->dq_flags & VZDQ_NOQUOT)
+#define VZ_QUOTA_EMPTY_IOPS		(&vfs_empty_iops)
+extern struct inode_operations vfs_empty_iops;
+/*
+ * Returns 1 if the inode's qlnk holds VZ_QUOTA_BAD or a qmblk that is not
+ * flagged VZDQ_NOACT; returns 0 if the link is empty (__VZ_QUOTA_EMPTY) or
+ * the qmblk is flagged VZDQ_NOACT.
+ */
+static int VZ_QUOTA_IS_ACTUAL(struct inode *inode)
+{
+	struct vz_quota_master *qmblk;
+
+	qmblk = INODE_QLNK(inode)->qmblk;
+	if (qmblk == VZ_QUOTA_BAD)
+		return 1;
+	if (qmblk == __VZ_QUOTA_EMPTY)
+		return 0;
+	if (qmblk->dq_flags & VZDQ_NOACT)
+		/* not actual (invalidated) qmblk */
+		return 0;
+	return 1;
+}
+/* Returns non-zero if @qlnk carries the __VZ_QUOTA_EMPTY marker. */
+static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk)
+{
+	return qlnk->qmblk == __VZ_QUOTA_EMPTY;
+}
+/*
+ * Record @origin as the newest entry of the two-slot origin history:
+ * the previous newest value moves from origin[1] to origin[0].
+ */
+static inline void set_qlnk_origin(struct vz_quota_ilink *qlnk,
+		unsigned char origin)
+{
+	qlnk->origin[0] = qlnk->origin[1];
+	qlnk->origin[1] = origin;
+}
+/* Mark @qlnk as empty and record VZ_QUOTAO_SETE as its origin. */
+static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk)
+{
+	qlnk->qmblk = __VZ_QUOTA_EMPTY;
+	set_qlnk_origin(qlnk, VZ_QUOTAO_SETE);
+}
+/* Zero @qlnk, initialize its list head and leave it in the empty state. */
+void vzquota_qlnk_init(struct vz_quota_ilink *qlnk)
+{
+	memset(qlnk, 0, sizeof(*qlnk));
+	INIT_LIST_HEAD(&qlnk->list);
+	vzquota_qlnk_set_empty(qlnk);
+	set_qlnk_origin(qlnk, VZ_QUOTAO_INIT);
+}
+/*
+ * Drop the references held by @qlnk: the ugid objects (under
+ * qmblk->dq_mutex, when CONFIG_VZ_QUOTA_UGID is set) and then the qmblk
+ * itself.  Does nothing for an empty link; NULL and VZ_QUOTA_BAD qmblk
+ * values are not dereferenced.  May sleep.
+ */
+void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk)
+{
+	might_sleep();
+	if (vzquota_qlnk_is_empty(qlnk))
+		return;
+#if defined(CONFIG_VZ_QUOTA_UGID)
+	if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) {
+		struct vz_quota_master *qmblk;
+		struct vz_quota_ugid *quid, *qgid;
+		qmblk = qlnk->qmblk;
+		quid = qlnk->qugid[USRQUOTA];
+		qgid = qlnk->qugid[GRPQUOTA];
+		if (quid != NULL || qgid != NULL) {
+			mutex_lock(&qmblk->dq_mutex);
+			if (qgid != NULL)
+				vzquota_put_ugid(qmblk, qgid);
+			if (quid != NULL)
+				vzquota_put_ugid(qmblk, quid);
+			mutex_unlock(&qmblk->dq_mutex);
+		}
+	}
+#endif
+	if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qlnk->qmblk);
+	set_qlnk_origin(qlnk, VZ_QUOTAO_DESTR);
+}
+
+/**
+ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents
+ * @qlt: temporary
+ * @qli: inode's
+ *
+ * Locking is provided by the caller (depending on the context).
+ * After swap, @qli is inserted into the corresponding dq_ilink_list
+ * (unless its new qmblk is the empty marker or VZ_QUOTA_BAD),
+ * @qlt list is reinitialized.
+ */
+static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt,
+		struct vz_quota_ilink *qli)
+{
+	struct vz_quota_master *qb;
+	struct vz_quota_ugid *qu;
+	int i;
+
+	qb = qlt->qmblk;
+	qlt->qmblk = qli->qmblk;
+	qli->qmblk = qb;
+	list_del_init(&qli->list);
+	if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD)
+		list_add(&qli->list, &qb->dq_ilink_list);
+	INIT_LIST_HEAD(&qlt->list);
+	set_qlnk_origin(qli, VZ_QUOTAO_SWAP);
+
+	for (i = 0; i < MAXQUOTAS; i++) {
+		qu = qlt->qugid[i];
+		qlt->qugid[i] = qli->qugid[i];
+		qli->qugid[i] = qu;
+	}
+}
+
+/**
+ * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks
+ *
+ * Called under dcache_lock and inode_qmblk locks.
+ * Returns 1 if locks were dropped inside, 0 if atomic.
+ * An empty link and a VZ_QUOTA_BAD link are handled without dropping locks.
+ */
+static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk,
+		struct inode *inode)
+{
+	if (vzquota_qlnk_is_empty(qlnk))
+		return 0;
+	if (qlnk->qmblk == VZ_QUOTA_BAD) {
+		vzquota_qlnk_set_empty(qlnk);
+		set_qlnk_origin(qlnk, VZ_QUOTAO_RE_LOCK);
+		return 0;
+	}
+	spin_unlock(&dcache_lock);
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(qlnk);
+	vzquota_qlnk_init(qlnk);
+	inode_qmblk_lock(inode->i_sb);
+	spin_lock(&dcache_lock);
+	return 1;
+}
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+/**
+ * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content
+ *
+ * Similar to vzquota_qlnk_reinit_locked, called under different locks.
+ */
+static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk,
+		struct inode *inode,
+		struct vz_quota_master *qmblk)
+{
+	if (vzquota_qlnk_is_empty(qlnk))
+		return 0;	/* nothing to destroy, locks were not dropped */
+	/* may be optimized if qlnk->qugid all NULLs */
+	qmblk_data_write_unlock(qmblk);
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(qlnk);
+	vzquota_qlnk_init(qlnk);
+	inode_qmblk_lock(inode->i_sb);
+	qmblk_data_write_lock(qmblk);
+	return 1;		/* locks were dropped and retaken */
+}
+#endif
+
+/**
+ * vzquota_qlnk_fill - fill vz_quota_ilink content
+ * @qlnk: vz_quota_ilink to fill
+ * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid)
+ * @qmblk: qmblk to which this @qlnk will belong
+ *
+ * Takes a reference on @qmblk unless it is VZ_QUOTA_BAD.  With
+ * CONFIG_VZ_QUOTA_UGID, when @qmblk is neither BAD nor NOQUOT and has
+ * VZDQUG_ON set, the locks are dropped to look up the uid/gid quota
+ * objects under qmblk->dq_mutex and then retaken.
+ *
+ * Called under dcache_lock and inode_qmblk locks.
+ * Returns 1 if locks were dropped inside, 0 if atomic.
+ * @qlnk is expected to be empty.
+ */
+static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk,
+		struct inode *inode,
+		struct vz_quota_master *qmblk)
+{
+	if (qmblk != VZ_QUOTA_BAD)
+		qmblk_get(qmblk);
+	qlnk->qmblk = qmblk;
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+	if (qmblk != VZ_QUOTA_BAD &&
+	    !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
+	    (qmblk->dq_flags & VZDQUG_ON)) {
+		struct vz_quota_ugid *quid, *qgid;
+		/* both spinlocks are released before taking the dq_mutex */
+		spin_unlock(&dcache_lock);
+		inode_qmblk_unlock(inode->i_sb);
+
+		mutex_lock(&qmblk->dq_mutex);
+		quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0);
+		qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0);
+		mutex_unlock(&qmblk->dq_mutex);
+
+		inode_qmblk_lock(inode->i_sb);
+		spin_lock(&dcache_lock);
+		qlnk->qugid[USRQUOTA] = quid;
+		qlnk->qugid[GRPQUOTA] = qgid;
+		return 1;
+	}
+#endif
+
+	return 0;
+}
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+/**
+ * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid
+ *
+ * This function is a helper for vzquota_transfer, and differs from
+ * vzquota_qlnk_fill only by locking.
+ */
+static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk,
+		struct inode *inode,
+		struct iattr *iattr,
+		int mask,
+		struct vz_quota_master *qmblk)
+{
+	qmblk_get(qmblk);
+	qlnk->qmblk = qmblk;
+	/*
+	 * @mask has bit (1 << USRQUOTA) and/or (1 << GRPQUOTA) set for the
+	 * ids that are looked up from @iattr; for a bit that is clear the
+	 * inode's current ugid object is reused via vzquota_get_ugid().
+	 * With @mask == 0 nothing is looked up and the locks stay held.
+	 */
+	if (mask) {
+		struct vz_quota_ugid *quid, *qgid;
+
+		quid = qgid = NULL; /* to make gcc happy */
+		if (!(mask & (1 << USRQUOTA)))
+			quid = vzquota_get_ugid(INODE_QLNK(inode)->
+					qugid[USRQUOTA]);
+		if (!(mask & (1 << GRPQUOTA)))
+			qgid = vzquota_get_ugid(INODE_QLNK(inode)->
+					qugid[GRPQUOTA]);
+		/* both spinlocks are released before taking the dq_mutex */
+		qmblk_data_write_unlock(qmblk);
+		inode_qmblk_unlock(inode->i_sb);
+
+		mutex_lock(&qmblk->dq_mutex);
+		if (mask & (1 << USRQUOTA))
+			quid = __vzquota_find_ugid(qmblk, iattr->ia_uid,
+					USRQUOTA, 0);
+		if (mask & (1 << GRPQUOTA))
+			qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid,
+					GRPQUOTA, 0);
+		mutex_unlock(&qmblk->dq_mutex);
+
+		inode_qmblk_lock(inode->i_sb);
+		qmblk_data_write_lock(qmblk);
+		qlnk->qugid[USRQUOTA] = quid;
+		qlnk->qugid[GRPQUOTA] = qgid;
+		return 1;	/* locks were dropped and retaken */
+	}
+
+	return 0;
+}
+#endif
+
+/**
+ * __vzquota_inode_init - make sure inode's qlnk is initialized
+ * @inode: inode whose qlnk is checked
+ * @origin: value recorded in the qlnk origin history on every call
+ *
+ * A NULL inode->i_dquot[USRQUOTA] is used as the "qlnk not initialized"
+ * marker; after initialization it is set to an all-ones pointer value.
+ *
+ * May be called if qlnk is already initialized, detects this situation itself.
+ * Called under inode_qmblk_lock.
+ */
+static void __vzquota_inode_init(struct inode *inode, unsigned char origin)
+{
+	if (inode->i_dquot[USRQUOTA] == NULL) {
+		vzquota_qlnk_init(INODE_QLNK(inode));
+		inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NULL;
+	}
+	set_qlnk_origin(INODE_QLNK(inode), origin);
+}
+
+/**
+ * vzquota_inode_drop - destroy VZ quota information in the inode
+ *
+ * Inode must not be externally accessible or dirty.
+ */
+static void vzquota_inode_drop(struct inode *inode)
+{
+	struct vz_quota_ilink qlnk;
+	/*
+	 * Swap the inode's link into a local, freshly initialized qlnk while
+	 * holding the inode_qmblk lock, reset i_dquot[USRQUOTA] to NULL, and
+	 * destroy the swapped-out content only after the lock is released.
+	 */
+	vzquota_qlnk_init(&qlnk);
+	inode_qmblk_lock(inode->i_sb);
+	vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode));
+	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DRCAL);
+	inode->i_dquot[USRQUOTA] = NULL;
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(&qlnk);
+}
+
+/**
+ * vzquota_inode_qmblk_set - initialize inode's qlnk
+ * @inode: inode to be initialized
+ * @qmblk: quota master block to which this inode should belong (may be BAD)
+ * @qlnk: placeholder to store data to resolve locking issues
+ *
+ * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise.
+ * Called under dcache_lock and inode_qmblk locks.
+ * @qlnk will be destroyed in the caller chain.
+ *
+ * It is not mandatory to restart parent checks since quota on/off currently
+ * shrinks dentry tree and checks that there are no outside references.
+ * But if at some time that shrink is removed, restarts will be required.
+ * Additionally, the restarts prevent inconsistencies if the dentry tree
+ * changes (inode is moved).  This is not a big deal, but anyway...
+ */ +static int vzquota_inode_qmblk_set(struct inode *inode, + struct vz_quota_master *qmblk, + struct vz_quota_ilink *qlnk) +{ + if (qmblk == NULL) { + printk(KERN_ERR "VZDQ: NULL in set, orig {%u, %u}, " + "dev %s, inode %lu, fs %s\n", + INODE_QLNK(inode)->origin[0], + INODE_QLNK(inode)->origin[1], + inode->i_sb->s_id, inode->i_ino, + inode->i_sb->s_type->name); + printk(KERN_ERR "current %d (%s), VE %d\n", + current->pid, current->comm, + VEID(get_exec_env())); + dump_stack(); + qmblk = VZ_QUOTA_BAD; + } + while (1) { + if (vzquota_qlnk_is_empty(qlnk) && + vzquota_qlnk_fill(qlnk, inode, qmblk)) + return 1; + if (qlnk->qmblk == qmblk) + break; + if (vzquota_qlnk_reinit_locked(qlnk, inode)) + return 1; + } + vzquota_qlnk_swap(qlnk, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_QSET); + return 0; +} + + +/* ---------------------------------------------------------------------- + * + * vzquota_inode_qmblk (inode -> qmblk lookup) parts + * + * --------------------------------------------------------------------- */ + +static int vzquota_dparents_check_attach(struct inode *inode) +{ + if (!list_empty(&inode->i_dentry)) + return 0; + printk(KERN_ERR "VZDQ: no parent for " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + return -1; +} + +static struct inode *vzquota_dparents_check_actual(struct inode *inode) +{ + struct dentry *de; + + list_for_each_entry(de, &inode->i_dentry, d_alias) { + if (de->d_parent == de) /* detached dentry, perhaps */ + continue; + /* first access to parent, make sure its qlnk initialized */ + __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT); + if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode)) + return de->d_parent->d_inode; + } + return NULL; +} + +static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode) +{ + struct dentry *de; + struct vz_quota_master *qmblk; + + qmblk = NULL; + list_for_each_entry(de, &inode->i_dentry, d_alias) { + if 
(de->d_parent == de) /* detached dentry, perhaps */ + continue; + if (qmblk == NULL) { + qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk; + continue; + } + if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) { + printk(KERN_WARNING "VZDQ: multiple quotas for " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + qmblk = VZ_QUOTA_BAD; + break; + } + } + if (qmblk == NULL) { + printk(KERN_WARNING "VZDQ: not attached to tree, " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + qmblk = VZ_QUOTA_BAD; + } + return qmblk; +} + +/* NFS root is disconnected dentry. */ + +static int is_nfs_root(struct inode * inode) +{ + struct dentry *de; + + if (inode->i_sb->s_magic != 0x6969) + return 0; + + if (list_empty(&inode->i_dentry)) + return 0; + + list_for_each_entry(de, &inode->i_dentry, d_alias) { + if (de->d_parent != de) + return 0; + if (d_unhashed(de)) + return 0; + if (!(de->d_flags & DCACHE_DISCONNECTED)) + return 0; + } + return 1; +} + +static void vzquota_dbranch_actualize(struct inode *inode, + struct inode *refinode) +{ + struct inode *pinode; + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk; + + vzquota_qlnk_init(&qlnk); + +start: + if (inode == inode->i_sb->s_root->d_inode || is_nfs_root(inode)) { + /* filesystem root */ + atomic_inc(&inode->i_count); + do { + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk)); + goto out; + } + + if (!vzquota_dparents_check_attach(inode)) { + pinode = vzquota_dparents_check_actual(inode); + if (pinode != NULL) { + inode = pinode; + goto start; + } + } + + atomic_inc(&inode->i_count); + while (1) { + if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */ + break; + /* + * Need to check parents again if we have slept inside + * vzquota_inode_qmblk_set() in the loop. 
+ * If the state of parents is different, just return and repeat + * the actualizing process again from the inode passed to + * vzquota_inode_qmblk_recalc(). + */ + if (!vzquota_dparents_check_attach(inode)) { + if (vzquota_dparents_check_actual(inode) != NULL) + break; + qmblk = vzquota_dparents_check_same(inode); + } else + qmblk = VZ_QUOTA_BAD; + if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */ + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ACT); + break; + } + } + +out: + spin_unlock(&dcache_lock); + inode_qmblk_unlock(refinode->i_sb); + vzquota_qlnk_destroy(&qlnk); + iput(inode); + inode_qmblk_lock(refinode->i_sb); + spin_lock(&dcache_lock); +} + +static void vzquota_dtree_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + struct inode *pinode; + struct vz_quota_master *qmblk; + + if (inode == inode->i_sb->s_root->d_inode || is_nfs_root(inode)) { + /* filesystem root */ + do { + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + } while (vzquota_inode_qmblk_set(inode, qmblk, qlnk)); + return; + } + +start: + if (VZ_QUOTA_IS_ACTUAL(inode)) + return; + /* + * Here qmblk is (re-)initialized for all ancestors. + * This is not a very efficient procedure, but it guarantees that + * the quota tree is consistent (that is, the inode doesn't have two + * ancestors with different qmblk). 
+ */ + if (!vzquota_dparents_check_attach(inode)) { + pinode = vzquota_dparents_check_actual(inode); + if (pinode != NULL) { + vzquota_dbranch_actualize(pinode, inode); + goto start; + } + qmblk = vzquota_dparents_check_same(inode); + } else + qmblk = VZ_QUOTA_BAD; + + if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) + goto start; + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DTREE); +} + +static void vzquota_det_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + struct inode *parent; + struct vz_quota_master *qmblk; + char *msg; + int cnt; + time_t timeout; + + cnt = 0; + parent = NULL; +start: + /* + * qmblk of detached inodes shouldn't be considered as not actual. + * They are not in any dentry tree, so quota on/off shouldn't affect + * them. + */ + if (!vzquota_qlnk_is_empty(INODE_QLNK(inode))) + return; + + timeout = 3; + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + /* + * Scenario: + * open + * unlink + * quotaon + * generic_delete_inode + * + * This is the first time vzquota sees inode. inode is outside of + * vzquota area of interest, otherwise quotaon would have got -EBUSY + * due to shrink_dcache_parent(). + * inode is almost completely destroyed, so don't intervene. + * + * dev@: + * However, there is a small race here... + * dput() first removes itself from all the lists, + * so shrink_dcache_parent() can succeed while dentry_iput is not + * done yet. 
+ */ + if (inode->i_state & I_FREEING) + goto set; + + msg = "detached inode not in creation"; + if (inode->i_op != VZ_QUOTA_EMPTY_IOPS) + goto fail; + qmblk = VZ_QUOTA_BAD; + msg = "unexpected creation context"; + if (!vzquota_cur_qmblk_check()) + goto fail; + timeout = 0; + parent = vzquota_cur_qmblk_fetch(); + msg = "uninitialized parent"; + if (vzquota_qlnk_is_empty(INODE_QLNK(parent))) + goto fail; + msg = "parent not in tree"; + if (list_empty(&parent->i_dentry)) + goto fail; + msg = "parent has 0 refcount"; + if (!atomic_read(&parent->i_count)) + goto fail; + msg = "parent has different sb"; + if (parent->i_sb != inode->i_sb) + goto fail; + if (!VZ_QUOTA_IS_ACTUAL(parent)) { + vzquota_dbranch_actualize(parent, inode); + goto start; + } + + qmblk = INODE_QLNK(parent)->qmblk; +set: + if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) + goto start; + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DET); + return; + +fail: + { + struct timeval tv, tvo; + do_gettimeofday(&tv); + memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo)); + tv.tv_sec -= tvo.tv_sec; + if (tv.tv_usec < tvo.tv_usec) { + tv.tv_sec--; + tv.tv_usec += USEC_PER_SEC - tvo.tv_usec; + } else + tv.tv_usec -= tvo.tv_usec; + if (tv.tv_sec < timeout) + goto set; + printk(KERN_ERR "VZDQ: %s, orig {%u, %u}," + " dev %s, inode %lu, fs %s\n", + msg, + INODE_QLNK(inode)->origin[0], + INODE_QLNK(inode)->origin[1], + inode->i_sb->s_id, inode->i_ino, + inode->i_sb->s_type->name); + printk(KERN_ERR "i_count %u, ", atomic_read(&inode->i_count)); + printk(KERN_ERR "i_mode %o, ", inode->i_mode); + printk(KERN_ERR "i_state %lx, ", inode->i_state); + printk(KERN_ERR "i_flags %x\n", inode->i_flags); + printk(KERN_ERR "i_op %p, vfs_empty_iops %p, " + "i_fop %p, i_mapping %p\n", + inode->i_op, &vfs_empty_iops, + inode->i_fop, inode->i_mapping); + if (!cnt++) { + printk(KERN_ERR "current %d (%s), VE %d," + " time %ld.%06ld\n", + current->pid, current->comm, + VEID(get_exec_env()), + tv.tv_sec, (long)tv.tv_usec); + 
dump_stack(); + } + if (parent != NULL) + printk(KERN_ERR "VZDQ: parent of %lu is %lu\n", + inode->i_ino, parent->i_ino); + } + goto set; +} + +static void vzquota_inode_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + spin_lock(&dcache_lock); + if (!list_empty(&inode->i_dentry)) + vzquota_dtree_qmblk_recalc(inode, qlnk); + else + vzquota_det_qmblk_recalc(inode, qlnk); + spin_unlock(&dcache_lock); +} + +/** + * vzquota_inode_qmblk - obtain inode's qmblk + * + * Returns qmblk with refcounter taken, %NULL if not under + * VZ quota or %VZ_QUOTA_BAD. + * + * FIXME: This function should be removed when vzquota_find_qmblk / + * get_quota_root / vzquota_dstat code is cleaned up. + */ +struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk; + + might_sleep(); + + if (inode->i_sb->dq_op != &vz_quota_operations) + return NULL; +#if defined(VZ_QUOTA_UNLOAD) +#error Make sure qmblk does not disappear +#endif + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(inode->i_sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) + vzquota_inode_qmblk_recalc(inode, &qlnk); + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != VZ_QUOTA_BAD) { + if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) + qmblk_get(qmblk); + else + qmblk = NULL; + } + + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&qlnk); + return qmblk; +} + +/** + * vzquota_find_qmblk - helper to emulate quota on virtual filesystems + * + * This function finds a quota master block corresponding to the root of + * a virtual filesystem. + * Returns a quota master block with reference taken, or %NULL if not under + * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation + * operations will fail). + * + * Note: this function uses vzquota_inode_qmblk(). 
+ * The latter is a rather confusing function: it returns qmblk that used to be + * on the inode some time ago (without guarantee that it still has any + * relations to the inode). So, vzquota_find_qmblk() leaves it up to the + * caller to think whether the inode could have changed its qmblk and what to + * do in that case. + * Currently, the callers appear to not care :( + */ +struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb) +{ + struct inode *qrinode; + struct vz_quota_master *qmblk; + + qmblk = NULL; + qrinode = NULL; + if (sb->s_op->get_quota_root != NULL) + qrinode = sb->s_op->get_quota_root(sb); + if (qrinode != NULL) + qmblk = vzquota_inode_qmblk(qrinode); + return qmblk; +} + +/* ---------------------------------------------------------------------- + * + * Calls from quota operations + * + * --------------------------------------------------------------------- */ + +/** + * vzquota_inode_init_call - call from DQUOT_INIT + */ +void vzquota_inode_init_call(struct inode *inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + /* initializes inode's quota inside */ + qmblk = vzquota_inode_data(inode, &data); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + vzquota_data_unlock(inode, &data); + + /* + * The check is needed for repeated new_inode() calls from a single + * ext3 call like create or mkdir in case of -ENOSPC. 
+ */ + spin_lock(&dcache_lock); + if (!list_empty(&inode->i_dentry)) + vzquota_cur_qmblk_set(inode); + spin_unlock(&dcache_lock); +} + +void vzquota_inode_swap_call(struct inode *inode, struct inode *tmpl) +{ + struct vz_quota_master *qmblk; + + __vzquota_inode_init(inode, VZ_QUOTAO_INIT); + + might_sleep(); + + inode_qmblk_lock(tmpl->i_sb); + if (unlikely(tmpl->i_flags & S_NOQUOTA)) { + inode_qmblk_unlock(tmpl->i_sb); + return; + } + __vzquota_inode_init(tmpl, VZ_QUOTAO_INICAL); + + qmblk = INODE_QLNK(tmpl)->qmblk; + if (qmblk != VZ_QUOTA_BAD) { + void * uq; + list_del_init(&INODE_QLNK(tmpl)->list); + vzquota_qlnk_swap(INODE_QLNK(tmpl), INODE_QLNK(inode)); /* inode takes over tmpl's quota link */ + uq = inode->i_dquot[USRQUOTA]; + inode->i_dquot[USRQUOTA] = tmpl->i_dquot[USRQUOTA]; + tmpl->i_dquot[USRQUOTA] = uq; + tmpl->i_flags |= S_NOQUOTA; + inode_qmblk_unlock(tmpl->i_sb); /* release the same sb lock that was taken above */ + + vzquota_inode_drop(tmpl); + } else { + inode_qmblk_unlock(tmpl->i_sb); + } +} + + +/** + * vzquota_inode_drop_call - call from DQUOT_DROP + */ +void vzquota_inode_drop_call(struct inode *inode) +{ + vzquota_inode_drop(inode); +} + +/** + * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs + * @inode: the inode + * @data: storage space + * + * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk. + * On return if qmblk is neither NULL nor VZ_QUOTA_BAD: + * qmblk in inode's qlnk is the same as returned, + * ugid pointers inside inode's qlnk are valid, + * some locks are taken (and should be released by vzquota_data_unlock). + * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken. 
+ */ +struct vz_quota_master *vzquota_inode_data(struct inode *inode, + struct vz_quota_datast *data) +{ + struct vz_quota_master *qmblk; + + might_sleep(); + + vzquota_qlnk_init(&data->qlnk); + inode_qmblk_lock(inode->i_sb); + if (unlikely(inode->i_flags & S_NOQUOTA)) { + inode_qmblk_unlock(inode->i_sb); + return NULL; + } + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) + vzquota_inode_qmblk_recalc(inode, &data->qlnk); + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != VZ_QUOTA_BAD) { + if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) { + /* + * Note that in the current implementation, + * inode_qmblk_lock can theoretically be dropped here. + * This place is serialized with quota_off because + * quota_off fails when there are extra dentry + * references and syncs inodes before removing quota + * information from them. + * However, quota usage information should stop being + * updated immediately after vzquota_off. 
+ */ + qmblk_data_write_lock(qmblk); + } else { + inode_qmblk_unlock(inode->i_sb); + qmblk = NULL; + } + } else { + inode_qmblk_unlock(inode->i_sb); + } + return qmblk; +} + +void vzquota_data_unlock(struct inode *inode, + struct vz_quota_datast *data) +{ + qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&data->qlnk); +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_inode_transfer_call - call from vzquota_transfer + */ +int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + struct vz_quota_ilink qlnew; + int mask; + int ret; + + might_sleep(); + vzquota_qlnk_init(&qlnew); +start: + qmblk = vzquota_inode_data(inode, &data); + ret = NO_QUOTA; + if (qmblk == VZ_QUOTA_BAD) + goto out_destr; + ret = QUOTA_OK; + if (qmblk == NULL) + goto out_destr; + qmblk_get(qmblk); + + ret = QUOTA_OK; + if (!(qmblk->dq_flags & VZDQUG_ON)) + /* no ugid quotas */ + goto out_unlock; + + mask = 0; + if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid) + mask |= 1 << USRQUOTA; + if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid) + mask |= 1 << GRPQUOTA; + while (1) { + if (vzquota_qlnk_is_empty(&qlnew) && + vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk)) + break; + if (qlnew.qmblk == INODE_QLNK(inode)->qmblk && + qlnew.qmblk == qmblk) + goto finish; + if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk)) + break; + } + + /* prepare for restart */ + vzquota_data_unlock(inode, &data); + qmblk_put(qmblk); + goto start; + +finish: + /* all references obtained successfully */ + ret = vzquota_transfer_usage(inode, mask, &qlnew); + if (!ret) { + vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_TRANS); + } +out_unlock: + vzquota_data_unlock(inode, &data); + qmblk_put(qmblk); +out_destr: + vzquota_qlnk_destroy(&qlnew); + return ret; +} +#endif + +int 
vzquota_rename_check(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk1, qlnk2, qlnk3; + int c, ret; + + if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb) + return -1; + + might_sleep(); + + vzquota_qlnk_init(&qlnk1); + vzquota_qlnk_init(&qlnk2); + vzquota_qlnk_init(&qlnk3); + inode_qmblk_lock(inode->i_sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + __vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL); + __vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL); + + do { + c = 0; + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) { + vzquota_inode_qmblk_recalc(inode, &qlnk1); + c++; + } + if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) || + !VZ_QUOTA_IS_ACTUAL(new_dir)) { + vzquota_inode_qmblk_recalc(new_dir, &qlnk2); + c++; + } + } while (c); + + ret = 0; + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != INODE_QLNK(new_dir)->qmblk) { + ret = -1; + while (vzquota_qlnk_is_empty(INODE_QLNK(old_dir)) || + !VZ_QUOTA_IS_ACTUAL(old_dir)) + vzquota_inode_qmblk_recalc(old_dir, &qlnk3); + if (qmblk != VZ_QUOTA_BAD && + !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && + qmblk->dq_root_path.dentry->d_inode == inode && + VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk, + inode->i_sb) && + VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk, + inode->i_sb)) + /* quota root rename is allowed */ + ret = 0; + } + + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&qlnk3); + vzquota_qlnk_destroy(&qlnk2); + vzquota_qlnk_destroy(&qlnk1); + return ret; +} + +/* + * Scan parent subdirs and find busy dentries names/path + * @parent: parent dentry + * @buf: buffer to store path. 
+ */ +static void vzdquota_read_busy_dentries(struct path *parent, + char *buf, int buflen) +{ + struct dentry *this_parent = parent->dentry; + struct list_head *next; + char *res, *end, *start; + struct path root, path; + int len; + + if (!buf || buflen <= 0) + return; + + path.mnt = parent->mnt; + /* From d_path() ... */ + read_lock(&current->fs->lock); + path_get(&current->fs->root); /* reference is dropped by path_put(&root) on exit */ + root = current->fs->root; + read_unlock(&current->fs->lock); + + spin_lock(&dcache_lock); + + end = buf + buflen; + start = buf; +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry; + int subdirs; + + dentry = list_entry(tmp, struct dentry, d_u.d_child); + next = tmp->next; + subdirs = !list_empty(&dentry->d_subdirs); + + if (atomic_read(&dentry->d_count) && !subdirs) { + if (!buflen) + goto out; + /* + * Note: __d_path will store filename at the + * end of buf. + */ + path.dentry = dentry; + res = __d_path(&path, &root, buf, buflen); + /* Exit if name is too long */ + if (IS_ERR(res)) + goto out; + + /* + * Move the string obtained by __d_path, + * behind the last dentry path in buf. + */ + len = end - res; + BUG_ON(len <= 0); + + memmove(buf, res, len); + + /* Trick: replace \0 by \n */ + if (buf != start) + *(char *)(buf - 1) = '\n'; + + buf += len; + buflen -= len; + } + + /* + * Descend a level if the d_subdirs list is non-empty. + */ + if (subdirs) { + this_parent = dentry; + goto repeat; + } + } + /* + * All done at this level ... ascend and resume the search. + */ + if (this_parent != parent->dentry) { + next = this_parent->d_u.d_child.next; + this_parent = this_parent->d_parent; + goto resume; + } +out: + /* From d_path() ... 
*/ + spin_unlock(&dcache_lock); + path_put(&root); +} + +/* ---------------------------------------------------------------------- + * + * qmblk-related parts of on/off operations + * + * --------------------------------------------------------------------- */ + +/** + * vzquota_check_dtree - check dentry tree if quota on/off is allowed + * + * This function doesn't allow quota to be turned on/off if some dentries in + * the tree have external references. + * In addition to technical reasons, it enforces user-space correctness: + * current usage (taken from or reported to the user space) can be meaningful + * and accurate only if the tree is not being modified. + * Side effect: additional vfsmount structures referencing the tree (bind + * mounts of tree nodes to some other places) are not allowed at on/off time. + * + * Store busy dentries path to the buf (if passed) in case of vzquota_off + * ioctl fail. + */ +int vzquota_check_dtree(struct vz_quota_master *qmblk, int off, + char *buf, int buflen) +{ + struct dentry *dentry; + int err, count; + + err = -EBUSY; + dentry = qmblk->dq_root_path.dentry; + + if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root) + goto unhashed; + + /* attempt to shrink */ + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dcache_lock); + inode_qmblk_unlock(dentry->d_sb); + shrink_dcache_parent(dentry); + inode_qmblk_lock(dentry->d_sb); + spin_lock(&dcache_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dcache_lock); + vzdquota_read_busy_dentries(&qmblk->dq_root_path, + buf, buflen); + spin_lock(&dcache_lock); + goto out; + } + + count = 1; + if (dentry == dentry->d_sb->s_root) + count += 2; /* sb and mnt refs */ + if (atomic_read(&dentry->d_count) < count) { + printk(KERN_ERR "%s: too small count %d vs %d.\n", + __FUNCTION__, + atomic_read(&dentry->d_count), count); + goto out; + } + if (atomic_read(&dentry->d_count) > count) + goto out; + } + + err = 0; +out: + return err; + +unhashed: + /* + * Quota root is 
removed. + * Allow to turn quota off, but not on. + */ + if (off) + err = 0; + goto out; +} + +int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, + struct vz_quota_master *qmblk, char __user *ubuf) +{ + struct vz_quota_ilink qlnk; + struct vz_quota_master *qold, *qnew; + int err; + char *buf; + + buf = (ubuf != NULL) ? (char *)get_zeroed_page(GFP_KERNEL) : NULL; /* zeroed: the whole page is copied to user space below */ + + might_sleep(); + + qold = NULL; + qnew = vzquota_alloc_fake(); + if (qnew == NULL) { + free_page((unsigned long)buf); + return -ENOMEM; + } + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + spin_lock(&dcache_lock); + while (1) { + err = vzquota_check_dtree(qmblk, 0, buf, PAGE_SIZE); + if (err) + break; + if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)) + break; + } + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ON); + spin_unlock(&dcache_lock); + + if (!err) { + qold = __VZ_QUOTA_NOQUOTA(sb); + qold->dq_flags |= VZDQ_NOACT; + __VZ_QUOTA_NOQUOTA(sb) = qnew; /* new fake qmblk becomes the sb's "no quota" marker */ + } + + inode_qmblk_unlock(sb); + vzquota_qlnk_destroy(&qlnk); + if (err) /* qnew was never installed: release it instead of leaking it */ + qold = qnew; + qmblk_put(qold); /* NOTE(review): assumes an uninstalled fake qmblk is released like the replaced one - confirm */ + if (buf) { + if (copy_to_user(ubuf, buf, PAGE_SIZE)) + ; + free_page((unsigned long)buf); + } + return err; +} + +int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk, + char __user *ubuf, int force) +{ + int ret; + char *buf; + + buf = (ubuf != NULL) ? 
(char *)get_zeroed_page(GFP_KERNEL) : NULL; /* zeroed: the whole page is copied to user space below */ + + ret = 0; + inode_qmblk_lock(sb); + + spin_lock(&dcache_lock); + if (vzquota_check_dtree(qmblk, 1, buf, PAGE_SIZE) && !force) + ret = -EBUSY; + spin_unlock(&dcache_lock); + + if (!ret) + qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT; + inode_qmblk_unlock(sb); + + if (buf) { + if (copy_to_user(ubuf, buf, PAGE_SIZE)) + ; + free_page((unsigned long)buf); + } + return ret; +} + + +/* ---------------------------------------------------------------------- + * + * External interfaces + * + * ---------------------------------------------------------------------*/ + +static int vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case VZCTL_QUOTA_NEW_CTL: { + struct vzctl_quotactl qb; + + err = -EFAULT; + if (copy_from_user(&qb, (void __user *)arg, sizeof(qb))) + break; + err = do_vzquotactl(qb.cmd, qb.quota_id, + qb.qstat, qb.ve_root, 0); + break; + } +#ifdef CONFIG_VZ_QUOTA_UGID + case VZCTL_QUOTA_UGID_CTL: { + struct vzctl_quotaugidctl qub; + + err = -EFAULT; + if (copy_from_user(&qub, (void __user *)arg, sizeof(qub))) + break; + err = do_vzquotaugidctl(qub.cmd, qub.quota_id, + qub.ugid_index, qub.ugid_size, qub.addr, 0); + break; + } +#endif + default: + err = -ENOTTY; + } + return err; +} + +#ifdef CONFIG_COMPAT +static int compat_vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case VZCTL_COMPAT_QUOTA_CTL: { + struct compat_vzctl_quotactl cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void __user *)arg, sizeof(cs))) + break; + err = do_vzquotactl(cs.cmd, cs.quota_id, + compat_ptr(cs.qstat), + compat_ptr(cs.ve_root), 1); + break; + } +#ifdef CONFIG_VZ_QUOTA_UGID + case VZCTL_COMPAT_QUOTA_UGID_CTL: { + struct compat_vzctl_quotaugidctl cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void __user *)arg, sizeof(cs))) + break; + + err = do_vzquotaugidctl(cs.cmd, cs.quota_id, cs.ugid_index, + cs.ugid_size, compat_ptr(cs.addr), 1); + 
break; + } +#endif + default: + err = -ENOIOCTLCMD; + } + return err; +} +#endif + +static struct vzioctlinfo vzdqcalls = { + .type = VZDQCTLTYPE, + .ioctl = vzquota_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_vzquota_ioctl, +#endif + .owner = THIS_MODULE, +}; + +/** + * vzquota_dstat - get quota usage info for virtual superblock + */ +static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat) +{ + struct vz_quota_master *qmblk; + + qmblk = vzquota_find_qmblk(super); + if (qmblk == NULL) + return -ENOENT; + if (qmblk == VZ_QUOTA_BAD) { + memset(qstat, 0, sizeof(*qstat)); + return 0; + } + + qmblk_data_read_lock(qmblk); + memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat)); + qmblk_data_read_unlock(qmblk); + qmblk_put(qmblk); + return 0; +} + + +/* ---------------------------------------------------------------------- + * + * Init/exit helpers + * + * ---------------------------------------------------------------------*/ + +static int vzquota_cache_init(void) +{ + int i; + + vzquota_cachep = kmem_cache_create("vz_quota_master", + sizeof(struct vz_quota_master), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (vzquota_cachep == NULL) { + printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); + goto nomem2; + } + for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) + INIT_LIST_HEAD(&vzquota_hash_table[i]); + + return 0; + +nomem2: + return -ENOMEM; +} + +static void vzquota_cache_release(void) +{ + int i; + + /* sanity check */ + for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) + if (!list_empty(&vzquota_hash_table[i])) + BUG(); + + /* release caches */ + kmem_cache_destroy(vzquota_cachep); + vzquota_cachep = NULL; +} + +static int quota_notifier_call(struct vnotifier_block *self, + unsigned long n, void *data, int err) +{ + struct virt_info_quota *viq; + struct super_block *sb; + + viq = (struct virt_info_quota *)data; + switch (n) { + case VIRTINFO_QUOTA_ON: + err = NOTIFY_BAD; + if (!try_module_get(THIS_MODULE)) + break; + sb = viq->super; + memset(&sb->s_dquot.info, 0, 
sizeof(sb->s_dquot.info)); + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_OFF: + module_put(THIS_MODULE); + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_GETSTAT: + err = NOTIFY_BAD; + if (vzquota_dstat(viq->super, viq->qstat)) + break; + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_DISABLE: + err = NOTIFY_OK; + vzquota_inode_off((struct inode *)data); + break; + } + return err; +} + +struct vnotifier_block quota_notifier_block = { + .notifier_call = quota_notifier_call, + .priority = INT_MAX, +}; + +/* ---------------------------------------------------------------------- + * + * Init/exit procedures + * + * ---------------------------------------------------------------------*/ + +static int __init vzquota_init(void) +{ + int err; + + if ((err = vzquota_cache_init()) != 0) + goto out_cache; + + if ((err = vzquota_proc_init()) != 0) + goto out_proc; + +#ifdef CONFIG_VZ_QUOTA_UGID + if ((err = vzquota_ugid_init()) != 0) + goto out_ugid; +#endif + + mutex_init(&vz_quota_mutex); + vzioctl_register(&vzdqcalls); + virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block); +#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS) + vzaquota_init(); +#endif + + return 0; + +#ifdef CONFIG_VZ_QUOTA_UGID +out_ugid: + vzquota_proc_release(); +#endif +out_proc: + vzquota_cache_release(); +out_cache: + return err; +} + +#if defined(VZ_QUOTA_UNLOAD) +static void __exit vzquota_release(void) +{ + virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block); + vzioctl_unregister(&vzdqcalls); +#ifdef CONFIG_VZ_QUOTA_UGID +#ifdef CONFIG_PROC_FS + vzaquota_fini(); +#endif + vzquota_ugid_release(); +#endif + vzquota_proc_release(); + vzquota_cache_release(); +} +#endif + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Disk Quota"); +MODULE_LICENSE("GPL v2"); + +module_init(vzquota_init) +#if defined(VZ_QUOTA_UNLOAD) 
+module_exit(vzquota_release) +#endif diff -urNp linux-2.6.32.48/fs/read_write.c linux-2.6.32.48-openvz/fs/read_write.c --- linux-2.6.32.48/fs/read_write.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/read_write.c 2011-11-21 17:40:45.000000000 -0500 @@ -21,6 +21,8 @@ #include #include +#include + const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -369,6 +371,29 @@ static inline void file_pos_write(struct file->f_pos = pos; } +static inline void bc_acct_write(ssize_t bytes) +{ + struct user_beancounter *ub; + + if (bytes > 0) { /* callers pass the raw syscall result: skip errors (negative) and empty writes */ + ub = get_exec_ub(); + ub_percpu_inc(ub, write); + ub_percpu_add(ub, wchar, bytes); + } +} + +static inline void bc_acct_read(ssize_t bytes) +{ + struct user_beancounter *ub; + + if (bytes > 0) { /* callers pass the raw syscall result: skip errors (negative) and empty reads */ + ub = get_exec_ub(); + ub_percpu_inc(ub, read); + ub_percpu_add(ub, rchar, bytes); + } +} + + SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { struct file *file; @@ -381,6 +406,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, ret = vfs_read(file, buf, count, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } return ret; @@ -399,6 +426,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, ret = vfs_write(file, buf, count, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } return ret; @@ -420,6 +449,8 @@ SYSCALL_DEFINE(pread64)(unsigned int fd, if (file->f_mode & FMODE_PREAD) ret = vfs_read(file, buf, count, &pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } return ret; @@ -449,6 +480,8 @@ SYSCALL_DEFINE(pwrite64)(unsigned int fd if (file->f_mode & FMODE_PWRITE) ret = vfs_write(file, buf, count, &pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } return ret; @@ -702,6 +735,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd ret = vfs_readv(file, vec, vlen, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } if (ret > 0) @@ -723,6 
+758,8 @@ SYSCALL_DEFINE3(writev, unsigned long, f ret = vfs_writev(file, vec, vlen, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } if (ret > 0) diff -urNp linux-2.6.32.48/fs/reiserfs/namei.c linux-2.6.32.48-openvz/fs/reiserfs/namei.c --- linux-2.6.32.48/fs/reiserfs/namei.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/reiserfs/namei.c 2011-11-21 17:40:45.000000000 -0500 @@ -826,6 +826,9 @@ static int reiserfs_rmdir(struct inode * INITIALIZE_PATH(path); struct reiserfs_dir_entry de; + inode = dentry->d_inode; + vfs_dq_init(inode); + /* we will be doing 2 balancings and update 2 stat data, we change quotas * of the owner of the directory and of the owner of the parent directory. * The quota structure is possibly deleted only on last iput => outside @@ -850,8 +853,6 @@ static int reiserfs_rmdir(struct inode * goto end_rmdir; } - inode = dentry->d_inode; - reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); @@ -915,6 +916,7 @@ static int reiserfs_unlink(struct inode unsigned long savelink; inode = dentry->d_inode; + vfs_dq_init(inode); /* in this transaction we can be doing at max two balancings and update * two stat datas, we change quotas of the owner of the directory and of @@ -1228,6 +1230,8 @@ static int reiserfs_rename(struct inode old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; + if (new_dentry_inode) + vfs_dq_init(new_dentry_inode); // make sure, that oldname still exists and points to an object we // are going to rename diff -urNp linux-2.6.32.48/fs/select.c linux-2.6.32.48-openvz/fs/select.c --- linux-2.6.32.48/fs/select.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/select.c 2011-11-21 17:40:45.000000000 -0500 @@ -29,6 +29,7 @@ #include +#include /* * Estimate expected accuracy in ns from a timeval. 
@@ -551,7 +552,8 @@ int core_sys_select(int n, fd_set __user if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); + bits = kmalloc(6 * size, size > PAGE_SIZE / 6 ? + GFP_KERNEL_UBC : GFP_KERNEL); if (!bits) goto out_nofds; } @@ -841,7 +843,7 @@ int do_sys_poll(struct pollfd __user *uf len = min(todo, POLLFD_PER_PAGE); size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; - walk = walk->next = kmalloc(size, GFP_KERNEL); + walk = walk->next = kmalloc(size, GFP_KERNEL_UBC); if (!walk) { err = -ENOMEM; goto out_fds; @@ -873,7 +875,7 @@ out_fds: return err; } -static long do_restart_poll(struct restart_block *restart_block) +long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; @@ -894,6 +896,7 @@ static long do_restart_poll(struct resta } return ret; } +EXPORT_SYMBOL_GPL(do_restart_poll); SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, long, timeout_msecs) diff -urNp linux-2.6.32.48/fs/seq_file.c linux-2.6.32.48-openvz/fs/seq_file.c --- linux-2.6.32.48/fs/seq_file.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/seq_file.c 2011-11-21 17:40:45.000000000 -0500 @@ -32,7 +32,7 @@ int seq_open(struct file *file, const st struct seq_file *p = file->private_data; if (!p) { - p = kmalloc(sizeof(*p), GFP_KERNEL); + p = kmalloc(sizeof(*p), GFP_KERNEL_UBC); if (!p) return -ENOMEM; file->private_data = p; @@ -76,7 +76,7 @@ static int traverse(struct seq_file *m, return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC); if (!m->buf) return -ENOMEM; } @@ -116,7 +116,7 @@ static int traverse(struct seq_file *m, Eoverflow: m->op->stop(m, p); kfree(m->buf); - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC); return 
!m->buf ? -ENOMEM : -EAGAIN; } @@ -169,7 +169,7 @@ ssize_t seq_read(struct file *file, char m->version = file->f_version; /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC); if (!m->buf) goto Enomem; } @@ -210,7 +210,7 @@ ssize_t seq_read(struct file *file, char goto Fill; m->op->stop(m, p); kfree(m->buf); - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC); if (!m->buf) goto Enomem; m->count = 0; @@ -435,6 +435,8 @@ int seq_path(struct seq_file *m, struct if (size) { char *p = d_path(path, buf, size); + if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG) + return 0; if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) @@ -551,7 +553,7 @@ static void single_stop(struct seq_file int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { - struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_UBC); int res = -ENOMEM; if (op) { @@ -595,7 +597,7 @@ void *__seq_open_private(struct file *f, void *private; struct seq_file *seq; - private = kzalloc(psize, GFP_KERNEL); + private = kzalloc(psize, GFP_KERNEL_UBC); if (private == NULL) goto out; diff -urNp linux-2.6.32.48/fs/signalfd.c linux-2.6.32.48-openvz/fs/signalfd.c --- linux-2.6.32.48/fs/signalfd.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/signalfd.c 2011-11-21 17:40:45.000000000 -0500 @@ -28,10 +28,7 @@ #include #include #include - -struct signalfd_ctx { - sigset_t sigmask; -}; +#include static int signalfd_release(struct inode *inode, struct file *file) { @@ -201,17 +198,17 @@ static ssize_t signalfd_read(struct file return total ? 
total: ret; } -static const struct file_operations signalfd_fops = { +const struct file_operations signalfd_fops = { .release = signalfd_release, .poll = signalfd_poll, .read = signalfd_read, }; +EXPORT_SYMBOL(signalfd_fops); SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, size_t, sizemask, int, flags) { sigset_t sigmask; - struct signalfd_ctx *ctx; /* Check the SFD_* constants for consistency. */ BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC); @@ -226,12 +223,19 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sig sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); signotset(&sigmask); + return do_signalfd(ufd, &sigmask, flags); +} + +long do_signalfd(int ufd, sigset_t *sigmask, int flags) +{ + struct signalfd_ctx *ctx; + if (ufd == -1) { ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; - ctx->sigmask = sigmask; + ctx->sigmask = *sigmask; /* * When we call this, the initialization must be complete, since @@ -251,7 +255,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sig return -EINVAL; } spin_lock_irq(&current->sighand->siglock); - ctx->sigmask = sigmask; + ctx->sigmask = *sigmask; spin_unlock_irq(&current->sighand->siglock); wake_up(&current->sighand->signalfd_wqh); @@ -260,6 +264,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sig return ufd; } +EXPORT_SYMBOL_GPL(do_signalfd); SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, size_t, sizemask) diff -urNp linux-2.6.32.48/fs/simfs.c linux-2.6.32.48-openvz/fs/simfs.c --- linux-2.6.32.48/fs/simfs.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/simfs.c 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,339 @@ +/* + * fs/simfs.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb + +static struct super_operations sim_super_ops; + +static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct super_block *sb; + struct inode *inode; + + inode = dentry->d_inode; + if (!inode->i_op->getattr) { + generic_fillattr(inode, stat); + if (!stat->blksize) { + unsigned blocks; + + sb = inode->i_sb; + blocks = (stat->size + sb->s_blocksize-1) >> + sb->s_blocksize_bits; + stat->blocks = (sb->s_blocksize / 512) * blocks; + stat->blksize = sb->s_blocksize; + } + } else { + int err; + + err = inode->i_op->getattr(mnt, dentry, stat); + if (err) + return err; + } + + if (!mnt) + return 0; + sb = mnt->mnt_sb; + if (sb->s_op == &sim_super_ops) + stat->dev = sb->s_dev; + return 0; +} + +static void quota_get_stat(struct super_block *sb, struct kstatfs *buf) +{ + int err; + struct dq_stat qstat; + struct virt_info_quota q; + long free_file, adj_file; + s64 blk, free_blk, adj_blk; + int bsize_bits; + + q.super = sb; + q.qstat = &qstat; + err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q); + if (err != NOTIFY_OK) + return; + + bsize_bits = ffs(buf->f_bsize) - 1; + + if (qstat.bsoftlimit > qstat.bcurrent) + free_blk = (qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits; + else + free_blk = 0; + /* + * In the regular case, we always set buf->f_bfree and buf->f_blocks to + * the values reported by quota. In case of real disk space shortage, + * we adjust the values. We want this adjustment to look as if the + * total disk space were reduced, not as if the usage were increased. 
+ * -- SAW + */ + adj_blk = 0; + if (buf->f_bfree < free_blk) + adj_blk = free_blk - buf->f_bfree; + buf->f_bfree = free_blk - adj_blk; + + if (free_blk < buf->f_bavail) + buf->f_bavail = free_blk; + + blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk; + buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk; + + free_file = qstat.isoftlimit - qstat.icurrent; + if (free_file < 0) + free_file = 0; + if (buf->f_type == REISERFS_SUPER_MAGIC) + /* + * reiserfs doesn't initialize f_ffree and f_files values of + * kstatfs because it doesn't have an inode limit. + */ + buf->f_ffree = free_file; + adj_file = 0; + if (buf->f_ffree < free_file) + adj_file = free_file - buf->f_ffree; + buf->f_ffree = free_file - adj_file; + buf->f_files = qstat.isoftlimit - adj_file; +} + +static int sim_statfs(struct super_block *sb, struct kstatfs *buf) +{ + int err; + struct super_block *lsb; + struct kstatfs statbuf; + + err = 0; + if (sb->s_op != &sim_super_ops) + return 0; + + memset(&statbuf, 0, sizeof(statbuf)); + lsb = SIMFS_GET_LOWER_FS_SB(sb); + + err = -ENOSYS; + if (lsb && lsb->s_op && lsb->s_op->statfs) + err = lsb->s_op->statfs(sb->s_root, &statbuf); + if (err) + return err; + + quota_get_stat(sb, &statbuf); + + buf->f_files = statbuf.f_files; + buf->f_ffree = statbuf.f_ffree; + buf->f_blocks = statbuf.f_blocks; + buf->f_bfree = statbuf.f_bfree; + buf->f_bavail = statbuf.f_bavail; + return 0; +} + +static int sim_systemcall(struct vnotifier_block *me, unsigned long n, + void *d, int old_ret) +{ + int err; + + switch (n) { + case VIRTINFO_FAUDIT_STAT: { + struct faudit_stat_arg *arg; + + arg = (struct faudit_stat_arg *)d; + err = sim_getattr(arg->mnt, arg->dentry, arg->stat); + arg->err = err; + } + break; + case VIRTINFO_FAUDIT_STATFS: { + struct faudit_statfs_arg *arg; + + arg = (struct faudit_statfs_arg *)d; + err = sim_statfs(arg->sb, arg->stat); + arg->err = err; + } + break; + default: + return old_ret; + } + return (err ? 
NOTIFY_BAD : NOTIFY_OK); +} + +#ifdef CONFIG_QUOTA +static struct inode *sim_quota_root(struct super_block *sb) +{ + return sb->s_root->d_inode; +} +#endif + +/* + * NOTE: We need to setup s_bdev field on super block, since sys_quotactl() + * does lookup_bdev() and get_super() which are comparing sb->s_bdev. + * so this is a MUST if we want unmodified sys_quotactl + * to work correctly on /dev/simfs inside VE + */ +static int sim_init_blkdev(struct super_block *sb) +{ + static struct hd_struct fake_hd; + struct block_device *blkdev; + + blkdev = bdget(sb->s_dev); + if (blkdev == NULL) + return -ENOMEM; + + blkdev->bd_part = &fake_hd; /* required for bdev_read_only() */ + sb->s_bdev = blkdev; + + return 0; +} + +static void sim_free_blkdev(struct super_block *sb) +{ + /* set bd_part back to NULL */ + sb->s_bdev->bd_part = NULL; + bdput(sb->s_bdev); +} + +static void sim_quota_init(struct super_block *sb) +{ + struct virt_info_quota viq; + + viq.super = sb; + virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq); +} + +static void sim_quota_free(struct super_block *sb) +{ + struct virt_info_quota viq; + + viq.super = sb; + virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq); +} + +static struct super_operations sim_super_ops = { +#ifdef CONFIG_QUOTA + .get_quota_root = sim_quota_root, +#endif +}; + +static int sim_fill_super(struct super_block *s, void *data) +{ + int err; + struct nameidata *nd; + + err = set_anon_super(s, NULL); + if (err) + goto out; + + err = 0; + nd = (struct nameidata *)data; + s->s_fs_info = mntget(nd->path.mnt); + s->s_root = dget(nd->path.dentry); + s->s_op = &sim_super_ops; +out: + return err; +} + +static int sim_get_sb(struct file_system_type *type, int flags, + const char *dev_name, void *opt, struct vfsmount *mnt) +{ + int err; + struct nameidata nd; + struct super_block *sb; + + err = -EINVAL; + if (opt == NULL) + goto out; + + err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (err) + goto out; + + sb 
= sget(type, NULL, sim_fill_super, &nd); + err = PTR_ERR(sb); + if (IS_ERR(sb)) + goto out_path; + + err = sim_init_blkdev(sb); + if (err) + goto out_killsb; + + sim_quota_init(sb); + + path_put(&nd.path); + simple_set_mnt(mnt, sb); + return 0; + +out_killsb: + up_write(&sb->s_umount); + deactivate_super(sb); +out_path: + path_put(&nd.path); +out: + return err; +} + +static void sim_kill_sb(struct super_block *sb) +{ + dput(sb->s_root); + sb->s_root = NULL; + mntput((struct vfsmount *)(sb->s_fs_info)); + + sim_quota_free(sb); + sim_free_blkdev(sb); + + kill_anon_super(sb); +} + +static struct file_system_type sim_fs_type = { + .owner = THIS_MODULE, + .name = "simfs", + .get_sb = sim_get_sb, + .kill_sb = sim_kill_sb, + .fs_flags = FS_MANGLE_PROC, +}; + +static struct vnotifier_block sim_syscalls = { + .notifier_call = sim_systemcall, +}; + +static int __init init_simfs(void) +{ + int err; + + err = register_filesystem(&sim_fs_type); + if (err) + return err; + + virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls); + return 0; +} + +static void __exit exit_simfs(void) +{ + virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls); + unregister_filesystem(&sim_fs_type); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System"); +MODULE_LICENSE("GPL v2"); + +module_init(init_simfs); +module_exit(exit_simfs); diff -urNp linux-2.6.32.48/fs/smbfs/sock.c linux-2.6.32.48-openvz/fs/smbfs/sock.c --- linux-2.6.32.48/fs/smbfs/sock.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/smbfs/sock.c 2011-11-21 17:40:45.000000000 -0500 @@ -99,6 +99,7 @@ smb_close_socket(struct smb_sb_info *ser VERBOSE("closing socket %p\n", sock); sock->sk->sk_data_ready = server->data_ready; + sock->sk->sk_user_data = NULL; server->sock_file = NULL; fput(file); } diff -urNp linux-2.6.32.48/fs/stat.c linux-2.6.32.48-openvz/fs/stat.c --- linux-2.6.32.48/fs/stat.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/stat.c 
2011-11-21 17:40:45.000000000 -0500 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -41,11 +42,19 @@ int vfs_getattr(struct vfsmount *mnt, st { struct inode *inode = dentry->d_inode; int retval; + struct faudit_stat_arg arg; retval = security_inode_getattr(mnt, dentry); if (retval) return retval; + arg.mnt = mnt; + arg.dentry = dentry; + arg.stat = stat; + if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg) + != NOTIFY_DONE) + return arg.err; + if (inode->i_op->getattr) return inode->i_op->getattr(mnt, dentry, stat); diff -urNp linux-2.6.32.48/fs/super.c linux-2.6.32.48-openvz/fs/super.c --- linux-2.6.32.48/fs/super.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/super.c 2011-11-21 17:40:45.000000000 -0500 @@ -37,12 +37,15 @@ #include #include #include +#include #include #include "internal.h" LIST_HEAD(super_blocks); +EXPORT_SYMBOL_GPL(super_blocks); DEFINE_SPINLOCK(sb_lock); +EXPORT_SYMBOL_GPL(sb_lock); /** * alloc_super - create new superblock @@ -69,13 +72,15 @@ static struct super_block *alloc_super(s INIT_LIST_HEAD(&s->s_dentry_lru); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); - lockdep_set_class(&s->s_umount, &type->s_umount_key); + lockdep_set_class(&s->s_umount, + &type->proto->s_umount_key); /* * The locking rules for s_lock are up to the * filesystem. For example ext3fs has different * lock ordering than usbfs: */ - lockdep_set_class(&s->s_lock, &type->s_lock_key); + lockdep_set_class(&s->s_lock, + &type->proto->s_lock_key); /* * sget() can have s_umount recursion. * @@ -307,11 +312,13 @@ void generic_shutdown_super(struct super /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); + if (sb->dq_op && sb->dq_op->shutdown) + sb->dq_op->shutdown(sb); if (sop->put_super) sop->put_super(sb); /* Forget any remaining inodes */ - if (invalidate_inodes(sb)) { + if (invalidate_inodes_check(sb, 1)) { printk("VFS: Busy inodes after unmount of %s. " "Self-destruct in 5 seconds. 
Have a nice day...\n", sb->s_id); @@ -531,17 +538,26 @@ rescan: spin_unlock(&sb_lock); return NULL; } +EXPORT_SYMBOL(user_get_super); SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) { + dev_t kdev; struct super_block *s; struct ustat tmp; struct kstatfs sbuf; - int err = -EINVAL; + int err; + + kdev = new_decode_dev(dev); + err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ); + if (err) + goto out; + + err = -EINVAL; + s = user_get_super(kdev); + if (s == NULL) + goto out; - s = user_get_super(new_decode_dev(dev)); - if (s == NULL) - goto out; err = vfs_statfs(s->s_root, &sbuf); drop_super(s); if (err) @@ -653,6 +669,13 @@ static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* for compatibility with coreutils still unaware of new minor sizes */ +int unnamed_dev_majors[] = { + 0, 144, 145, 146, 242, 243, 244, 245, + 246, 247, 248, 249, 250, 251, 252, 253 +}; +EXPORT_SYMBOL(unnamed_dev_majors); + int set_anon_super(struct super_block *s, void *data) { int dev; @@ -672,7 +695,7 @@ int set_anon_super(struct super_block *s else if (error) return -EAGAIN; - if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) @@ -680,7 +703,7 @@ int set_anon_super(struct super_block *s spin_unlock(&unnamed_dev_lock); return -EMFILE; } - s->s_dev = MKDEV(0, dev & MINORMASK); + s->s_dev = make_unnamed_dev(dev); return 0; } @@ -688,8 +711,9 @@ EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { - int slot = MINOR(sb->s_dev); + int slot; + slot = unnamed_dev_idx(sb->s_dev); generic_shutdown_super(sb); spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, slot); diff -urNp linux-2.6.32.48/fs/sync.c linux-2.6.32.48-openvz/fs/sync.c --- linux-2.6.32.48/fs/sync.c 2011-11-08 19:02:43.000000000 
-0500 +++ linux-2.6.32.48-openvz/fs/sync.c 2011-11-21 17:40:45.000000000 -0500 @@ -15,6 +15,8 @@ #include #include "internal.h" +#include + #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) @@ -127,11 +129,18 @@ restart: */ SYSCALL_DEFINE0(sync) { + struct user_beancounter *ub; + + ub = get_exec_ub(); + ub_percpu_inc(ub, sync); + wakeup_flusher_threads(0); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) laptop_sync_completion(); + + ub_percpu_inc(ub, sync_done); return 0; } @@ -207,6 +216,7 @@ int vfs_fsync_range(struct file *file, s const struct file_operations *fop; struct address_space *mapping; int err, ret; + struct user_beancounter *ub; /* * Get mapping and operations from the file in case we have @@ -226,6 +236,12 @@ int vfs_fsync_range(struct file *file, s goto out; } + ub = get_exec_ub(); + if (datasync) + ub_percpu_inc(ub, fdsync); + else + ub_percpu_inc(ub, fsync); + ret = filemap_write_and_wait_range(mapping, start, end); /* @@ -238,6 +254,10 @@ int vfs_fsync_range(struct file *file, s ret = err; mutex_unlock(&mapping->host->i_mutex); + if (datasync) + ub_percpu_inc(ub, fdsync_done); + else + ub_percpu_inc(ub, fsync_done); out: return ret; } @@ -444,12 +464,16 @@ int do_sync_mapping_range(struct address loff_t endbyte, unsigned int flags) { int ret; + struct user_beancounter *ub; if (!mapping) { ret = -EINVAL; - goto out; + goto out_noacct; } + ub = get_exec_ub(); + ub_percpu_inc(ub, frsync); + ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { ret = wait_on_page_writeback_range(mapping, @@ -472,6 +496,8 @@ int do_sync_mapping_range(struct address endbyte >> PAGE_CACHE_SHIFT); } out: + ub_percpu_inc(ub, frsync_done); +out_noacct: return ret; } EXPORT_SYMBOL_GPL(do_sync_mapping_range); diff -urNp linux-2.6.32.48/fs/sysfs/bin.c linux-2.6.32.48-openvz/fs/sysfs/bin.c --- linux-2.6.32.48/fs/sysfs/bin.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/sysfs/bin.c 
2011-11-21 17:40:45.000000000 -0500 @@ -398,6 +398,9 @@ static int open(struct inode * inode, st struct bin_buffer *bb = NULL; int error; + if (!ve_sysfs_alowed()) + return 0; + /* binary file operations requires both @sd and its parent */ if (!sysfs_get_active_two(attr_sd)) return -ENODEV; @@ -485,6 +488,9 @@ void unmap_bin_file(struct sysfs_dirent int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) { + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj || !kobj->sd || !attr); return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); @@ -499,6 +505,8 @@ int sysfs_create_bin_file(struct kobject void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) { + if (!ve_sysfs_alowed()) + return; sysfs_hash_and_remove(kobj->sd, attr->attr.name); } diff -urNp linux-2.6.32.48/fs/sysfs/dir.c linux-2.6.32.48-openvz/fs/sysfs/dir.c --- linux-2.6.32.48/fs/sysfs/dir.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/sysfs/dir.c 2011-11-21 17:40:45.000000000 -0500 @@ -539,6 +539,9 @@ static void sysfs_drop_dentry(struct sys struct inode *inode; struct dentry *dentry; + if (!ve_sysfs_alowed()) + return; + inode = ilookup(sysfs_sb, sd->s_ino); if (!inode) return; @@ -712,12 +715,15 @@ int sysfs_create_dir(struct kobject * ko struct sysfs_dirent *parent_sd, *sd; int error = 0; + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj); if (kobj->parent) parent_sd = kobj->parent->sd; else - parent_sd = &sysfs_root; + parent_sd = ve_sysfs_root; error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); if (!error) @@ -819,6 +825,9 @@ void sysfs_remove_dir(struct kobject * k { struct sysfs_dirent *sd = kobj->sd; + if (!ve_sysfs_alowed()) + return; + spin_lock(&sysfs_assoc_lock); kobj->sd = NULL; spin_unlock(&sysfs_assoc_lock); @@ -834,6 +843,9 @@ int sysfs_rename_dir(struct kobject * ko const char *dup_name = NULL; int error; + if (!ve_sysfs_alowed()) + return 0; + mutex_lock(&sysfs_rename_mutex); error = 0; @@ -899,7 
+911,7 @@ int sysfs_move_dir(struct kobject *kobj, mutex_lock(&sysfs_rename_mutex); BUG_ON(!sd->s_parent); new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ? - new_parent_kobj->sd : &sysfs_root; + new_parent_kobj->sd : ve_sysfs_root; error = 0; if (sd->s_parent == new_parent_sd) diff -urNp linux-2.6.32.48/fs/sysfs/file.c linux-2.6.32.48-openvz/fs/sysfs/file.c --- linux-2.6.32.48/fs/sysfs/file.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/sysfs/file.c 2011-11-21 17:40:45.000000000 -0500 @@ -536,6 +536,8 @@ int sysfs_add_file(struct sysfs_dirent * int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) { + if (!ve_sysfs_alowed()) + return 0; BUG_ON(!kobj || !kobj->sd || !attr); return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); @@ -634,6 +636,8 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { + if (!ve_sysfs_alowed()) + return; sysfs_hash_and_remove(kobj->sd, attr->name); } diff -urNp linux-2.6.32.48/fs/sysfs/group.c linux-2.6.32.48-openvz/fs/sysfs/group.c --- linux-2.6.32.48/fs/sysfs/group.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/sysfs/group.c 2011-11-21 17:40:45.000000000 -0500 @@ -62,6 +62,8 @@ static int internal_create_group(struct struct sysfs_dirent *sd; int error; + if (!ve_sysfs_alowed()) + return 0; BUG_ON(!kobj || (!update && !kobj->sd)); /* Updates may happen before the object has been instantiated */ @@ -131,6 +133,9 @@ void sysfs_remove_group(struct kobject * struct sysfs_dirent *dir_sd = kobj->sd; struct sysfs_dirent *sd; + if (!ve_sysfs_alowed()) + return; + if (grp->name) { sd = sysfs_get_dirent(dir_sd, grp->name); if (!sd) { diff -urNp linux-2.6.32.48/fs/sysfs/inode.c linux-2.6.32.48-openvz/fs/sysfs/inode.c --- linux-2.6.32.48/fs/sysfs/inode.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/sysfs/inode.c 2011-11-21 17:40:45.000000000 -0500 @@ -22,8 +22,6 @@ #include #include 
"sysfs.h" -extern struct super_block * sysfs_sb; - static const struct address_space_operations sysfs_aops = { .readpage = simple_readpage, .write_begin = simple_write_begin, diff -urNp linux-2.6.32.48/fs/sysfs/mount.c linux-2.6.32.48-openvz/fs/sysfs/mount.c --- linux-2.6.32.48/fs/sysfs/mount.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/sysfs/mount.c 2011-11-21 17:40:45.000000000 -0500 @@ -22,8 +22,22 @@ #include "sysfs.h" -static struct vfsmount *sysfs_mount; +#ifndef CONFIG_VE +struct vfsmount *sysfs_mount; struct super_block * sysfs_sb = NULL; +#endif + +#ifdef CONFIG_SYSFS_DEPRECATED_DYN +unsigned sysfs_deprecated __read_mostly; + +static int __init sysfs_init_deprecated(char *str) +{ + sysfs_deprecated = 1; + return 1; +} +__setup("old_sysfs", sysfs_init_deprecated); +#endif + struct kmem_cache *sysfs_dir_cachep; static const struct super_operations sysfs_ops = { @@ -40,6 +54,13 @@ struct sysfs_dirent sysfs_root = { .s_ino = 1, }; +static void init_ve0_sysfs_root(void) +{ +#ifdef CONFIG_VE + get_ve0()->_sysfs_root = &sysfs_root; +#endif +} + static int sysfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; @@ -54,7 +75,7 @@ static int sysfs_fill_super(struct super /* get root inode, initialize and unlock it */ mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(&sysfs_root); + inode = sysfs_get_inode(ve_sysfs_root); mutex_unlock(&sysfs_mutex); if (!inode) { pr_debug("sysfs: could not get root inode\n"); @@ -68,7 +89,7 @@ static int sysfs_fill_super(struct super iput(inode); return -ENOMEM; } - root->d_fsdata = &sysfs_root; + root->d_fsdata = ve_sysfs_root; sb->s_root = root; return 0; } @@ -79,16 +100,19 @@ static int sysfs_get_sb(struct file_syst return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); } -static struct file_system_type sysfs_fs_type = { +struct file_system_type sysfs_fs_type = { .name = "sysfs", .get_sb = sysfs_get_sb, .kill_sb = kill_anon_super, }; 
+EXPORT_SYMBOL(sysfs_fs_type); + int __init sysfs_init(void) { int err = -ENOMEM; + init_ve0_sysfs_root(); sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", sizeof(struct sysfs_dirent), 0, 0, NULL); diff -urNp linux-2.6.32.48/fs/sysfs/symlink.c linux-2.6.32.48-openvz/fs/sysfs/symlink.c --- linux-2.6.32.48/fs/sysfs/symlink.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/sysfs/symlink.c 2011-11-21 17:40:45.000000000 -0500 @@ -29,10 +29,13 @@ static int sysfs_do_create_link(struct k struct sysfs_addrm_cxt acxt; int error; + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!name); if (!kobj) - parent_sd = &sysfs_root; + parent_sd = ve_sysfs_root; else parent_sd = kobj->sd; @@ -115,8 +118,11 @@ void sysfs_remove_link(struct kobject * { struct sysfs_dirent *parent_sd = NULL; + if(!ve_sysfs_alowed()) + return; + if (!kobj) - parent_sd = &sysfs_root; + parent_sd = ve_sysfs_root; else parent_sd = kobj->sd; diff -urNp linux-2.6.32.48/fs/sysfs/sysfs.h linux-2.6.32.48-openvz/fs/sysfs/sysfs.h --- linux-2.6.32.48/fs/sysfs/sysfs.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/sysfs/sysfs.h 2011-11-21 17:40:45.000000000 -0500 @@ -10,74 +10,17 @@ #include -struct sysfs_open_dirent; - -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { - struct kobject *kobj; - /* children list starts here and goes through sd->s_sibling */ - struct sysfs_dirent *children; -}; - -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; -}; - -struct sysfs_elem_attr { - struct attribute *attr; - struct sysfs_open_dirent *open; -}; - -struct sysfs_elem_bin_attr { - struct bin_attribute *bin_attr; - struct hlist_head buffers; -}; - -struct sysfs_inode_attrs { - struct iattr ia_iattr; - void *ia_secdata; - u32 ia_secdata_len; -}; - -/* - * sysfs_dirent - the building block of sysfs hierarchy. Each and - * every sysfs node is represented by single sysfs_dirent. 
- * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. - */ -struct sysfs_dirent { - atomic_t s_count; - atomic_t s_active; - struct sysfs_dirent *s_parent; - struct sysfs_dirent *s_sibling; - const char *s_name; - - union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; - struct sysfs_elem_bin_attr s_bin_attr; - }; - - unsigned int s_flags; - ino_t s_ino; - umode_t s_mode; - struct sysfs_inode_attrs *s_iattr; -}; - -#define SD_DEACTIVATED_BIAS INT_MIN - -#define SYSFS_TYPE_MASK 0x00ff -#define SYSFS_DIR 0x0001 -#define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_BIN_ATTR 0x0004 -#define SYSFS_KOBJ_LINK 0x0008 -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) - -#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK -#define SYSFS_FLAG_REMOVED 0x0200 +#ifndef CONFIG_VE +extern struct vfsmount *sysfs_mount; +extern struct super_block *sysfs_sb; +#define ve_sysfs_alowed() 1 +#else +#include +#include +#define sysfs_mount (get_exec_env()->sysfs_mnt) +#define sysfs_sb (get_exec_env()->sysfs_sb) +#define ve_sysfs_alowed() (sysfs_sb != NULL) +#endif static inline unsigned int sysfs_type(struct sysfs_dirent *sd) { @@ -97,8 +40,12 @@ struct sysfs_addrm_cxt { /* * mount.c */ +#ifdef CONFIG_VE +#define ve_sysfs_root (get_exec_env()->_sysfs_root) +#else extern struct sysfs_dirent sysfs_root; -extern struct super_block *sysfs_sb; +#define ve_sysfs_root (&sysfs_root) +#endif extern struct kmem_cache *sysfs_dir_cachep; /* diff -urNp linux-2.6.32.48/fs/utimes.c linux-2.6.32.48-openvz/fs/utimes.c --- linux-2.6.32.48/fs/utimes.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/utimes.c 2011-11-21 17:40:45.000000000 -0500 @@ -40,6 +40,20 @@ SYSCALL_DEFINE2(utime, char __user *, fi #endif +SYSCALL_DEFINE2(lutime, char __user *, filename, struct utimbuf __user *, times) +{ + struct timespec tv[2]; + + if 
(times) { + if (get_user(tv[0].tv_sec, &times->actime) || + get_user(tv[1].tv_sec, &times->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, times ? tv : NULL, AT_SYMLINK_NOFOLLOW); +} + static bool nsec_valid(long nsec) { if (nsec == UTIME_OMIT || nsec == UTIME_NOW) diff -urNp linux-2.6.32.48/fs/xattr.c linux-2.6.32.48-openvz/fs/xattr.c --- linux-2.6.32.48/fs/xattr.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/fs/xattr.c 2011-11-21 17:40:45.000000000 -0500 @@ -115,6 +115,15 @@ vfs_setxattr(struct dentry *dentry, cons struct inode *inode = dentry->d_inode; int error; +#if defined(CONFIG_VE) && defined(CONFIG_SYSCTL) + if (!ve_is_super(get_exec_env())) { + if (ve_xattr_policy == VE_XATTR_POLICY_IGNORE) + return 0; + else if (ve_xattr_policy == VE_XATTR_POLICY_REJECT) + return -EPERM; + } +#endif + error = xattr_permission(inode, name, MAY_WRITE); if (error) return error; diff -urNp linux-2.6.32.48/include/asm-generic/mman.h linux-2.6.32.48-openvz/include/asm-generic/mman.h --- linux-2.6.32.48/include/asm-generic/mman.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/asm-generic/mman.h 2011-11-21 17:40:45.000000000 -0500 @@ -12,6 +12,7 @@ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ +#define MAP_EXECPRIO 0x80000 /* soft ubc charge */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff -urNp linux-2.6.32.48/include/bc/beancounter.h linux-2.6.32.48-openvz/include/bc/beancounter.h --- linux-2.6.32.48/include/bc/beancounter.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/beancounter.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,453 @@ +/* + * include/bc/beancounter.h + * + * Copyright (C) 1999-2005 SWsoft + * All rights 
reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Andrey Savochkin saw@sw-soft.com + * + */ + +#ifndef _LINUX_BEANCOUNTER_H +#define _LINUX_BEANCOUNTER_H + +/* + * Generic ratelimiting stuff. + */ + +struct ub_rate_info { + int burst; + int interval; /* jiffy_t per event */ + int bucket; /* kind of leaky bucket */ + unsigned long last; /* last event */ +}; + +/* Return true if rate limit permits. */ +int ub_ratelimit(struct ub_rate_info *); + + +/* + * This magic is used to distinguish user beancounter and pages beancounter + * in struct page. page_ub and page_bc are placed in union and MAGIC + * ensures us that we don't use pbc as ubc in ub_page_uncharge(). + */ +#define UB_MAGIC 0x62756275 + +/* + * Resource list. + */ + +#define UB_KMEMSIZE 0 /* Unswappable kernel memory size including + * struct task, page directories, etc. + */ +#define UB_LOCKEDPAGES 1 /* Mlock()ed pages. */ +#define UB_PRIVVMPAGES 2 /* Total number of pages, counting potentially + * private pages as private and used. + */ +#define UB_SHMPAGES 3 /* IPC SHM segment size. */ +#define UB_DUMMY 4 /* Dummy resource (compatibility) */ +#define UB_NUMPROC 5 /* Number of processes. */ +#define UB_PHYSPAGES 6 /* All resident pages, for swapout guarantee. */ +#define UB_VMGUARPAGES 7 /* Guarantee for memory allocation, + * checked against PRIVVMPAGES. + */ +#define UB_OOMGUARPAGES 8 /* Guarantees against OOM kill. + * Only limit is used, no accounting. + */ +#define UB_NUMTCPSOCK 9 /* Number of TCP sockets. */ +#define UB_NUMFLOCK 10 /* Number of file locks. */ +#define UB_NUMPTY 11 /* Number of PTYs. */ +#define UB_NUMSIGINFO 12 /* Number of siginfos. */ +#define UB_TCPSNDBUF 13 /* Total size of tcp send buffers. */ +#define UB_TCPRCVBUF 14 /* Total size of tcp receive buffers. */ +#define UB_OTHERSOCKBUF 15 /* Total size of other socket + * send buffers (all buffers for PF_UNIX). + */ +#define UB_DGRAMRCVBUF 16 /* Total size of other socket + * receive buffers. 
+ */ +#define UB_NUMOTHERSOCK 17 /* Number of other sockets. */ +#define UB_DCACHESIZE 18 /* Size of busy dentry/inode cache. */ +#define UB_NUMFILE 19 /* Number of open files. */ + +#define UB_RESOURCES_COMPAT 24 + +/* Add new resources here */ + +#define UB_NUMXTENT 23 +#define UB_SWAPPAGES 24 +#define UB_RESOURCES 25 + +#define UB_UNUSEDPRIVVM (UB_RESOURCES + 0) +#define UB_TMPFSPAGES (UB_RESOURCES + 1) +#define UB_HELDPAGES (UB_RESOURCES + 2) + +struct ubparm { + /* + * A barrier over which resource allocations are failed gracefully. + * If the amount of consumed memory is over the barrier further sbrk() + * or mmap() calls fail, the existing processes are not killed. + */ + unsigned long barrier; + /* hard resource limit */ + unsigned long limit; + /* consumed resources */ + unsigned long held; + /* maximum amount of consumed resources through the last period */ + unsigned long maxheld; + /* minimum amount of consumed resources through the last period */ + unsigned long minheld; + /* count of failed charges */ + unsigned long failcnt; +}; + +/* + * Kernel internal part. + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form. 
+ */ +#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1) + + +/* + * Resource management structures + * Serialization issues: + * beancounter list management is protected via ub_hash_lock + * task pointers are set only for current task and only once + * refcount is managed atomically + * value and limit comparison and change are protected by per-ub spinlock + */ + +struct page_beancounter; +struct task_beancounter; +struct sock_beancounter; + +struct page_private { + unsigned long ubp_unused_privvmpages; + unsigned long ubp_tmpfs_respages; + unsigned long ubp_pbcs; + unsigned long long ubp_held_pages; +}; + +struct sock_private { + unsigned long ubp_rmem_thres; + unsigned long ubp_wmem_pressure; + unsigned long ubp_maxadvmss; + unsigned long ubp_rmem_pressure; + int ubp_tw_count; +#define UB_RMEM_EXPAND 0 +#define UB_RMEM_KEEP 1 +#define UB_RMEM_SHRINK 2 + struct list_head ubp_other_socks; + struct list_head ubp_tcp_socks; + struct percpu_counter ubp_orphan_count; +}; + +struct ub_percpu_struct { + unsigned long unmap; + unsigned long swapin; +#ifdef CONFIG_BC_IO_ACCOUNTING + unsigned long long bytes_wrote; + unsigned long long bytes_read; + unsigned long long bytes_cancelled; +#endif +#ifdef CONFIG_BC_DEBUG_KMEM + long pages_charged; + long vmalloc_charged; +#endif + unsigned long sync; + unsigned long sync_done; + + unsigned long fsync; + unsigned long fsync_done; + + unsigned long fdsync; + unsigned long fdsync_done; + + unsigned long frsync; + unsigned long frsync_done; + + unsigned long write; + unsigned long read; + unsigned long long wchar; + unsigned long long rchar; +}; + +struct user_beancounter +{ + unsigned long ub_magic; + atomic_t ub_refcount; + struct list_head ub_list; + struct hlist_node ub_hash; + + union { + struct rcu_head rcu; + struct execute_work cleanup; + }; + + spinlock_t ub_lock; + uid_t ub_uid; + unsigned int ub_cookie; + + struct ub_rate_info ub_limit_rl; + int ub_oom_noproc; + + struct page_private ppriv; +#define 
ub_unused_privvmpages ppriv.ubp_unused_privvmpages +#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages +#define ub_held_pages ppriv.ubp_held_pages +#define ub_pbcs ppriv.ubp_pbcs + struct sock_private spriv; +#define ub_rmem_thres spriv.ubp_rmem_thres +#define ub_maxadvmss spriv.ubp_maxadvmss +#define ub_rmem_pressure spriv.ubp_rmem_pressure +#define ub_wmem_pressure spriv.ubp_wmem_pressure +#define ub_tcp_sk_list spriv.ubp_tcp_socks +#define ub_other_sk_list spriv.ubp_other_socks +#define ub_orphan_count spriv.ubp_orphan_count +#define ub_tw_count spriv.ubp_tw_count + + struct user_beancounter *parent; + int ub_childs; + void *private_data; + unsigned long ub_aflags; + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc; +#endif + + /* resources statistic and settings */ + struct ubparm ub_parms[UB_RESOURCES]; + /* resources statistic for last interval */ + struct ubparm ub_store[UB_RESOURCES]; + + struct ub_percpu_struct *ub_percpu; +#ifdef CONFIG_BC_IO_ACCOUNTING + /* these are protected with pb_lock */ + unsigned long long bytes_wrote; + unsigned long long bytes_dirtied; + unsigned long long bytes_dirty_missed; + unsigned long io_pb_held; +#endif +#ifdef CONFIG_BC_DEBUG_KMEM + struct list_head ub_cclist; +#endif +}; + +extern int ub_count; + +enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE }; + +#define UB_AFLAG_NOTIF_PAGEIN 0 + +static inline +struct user_beancounter *top_beancounter(struct user_beancounter *ub) +{ + while (ub->parent != NULL) + ub = ub->parent; + return ub; +} + +static inline int ub_barrier_hit(struct user_beancounter *ub, int resource) +{ + return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier; +} + +static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource) +{ + return (ub->ub_parms[resource].held > + ((ub->ub_parms[resource].barrier) >> 1)); +} + +static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource) +{ + struct ubparm *p; + p = ub->ub_parms + resource; + return p->held <= 
(p->barrier >> 3); +} + +static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource) +{ + struct ubparm *p; + p = ub->ub_parms + resource; + return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024; +} + +#ifndef CONFIG_BEANCOUNTERS + +#define ub_percpu_add(ub, f, v) do { } while (0) +#define ub_percpu_sub(ub, f, v) do { } while (0) +#define ub_percpu_inc(ub, f) do { } while (0) +#define ub_percpu_dec(ub, f) do { } while (0) + +#define mm_ub(mm) (NULL) + +extern inline struct user_beancounter *get_beancounter_byuid + (uid_t uid, int create) { return NULL; } +extern inline struct user_beancounter *get_beancounter + (struct user_beancounter *ub) { return NULL; } +extern inline void put_beancounter(struct user_beancounter *ub) { } + +static inline void ub_init_late(void) { }; +static inline void ub_init_early(void) { }; + +static inline int charge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val, + enum ub_severity strict) { return 0; } +static inline void uncharge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val) { } + +#else /* CONFIG_BEANCOUNTERS */ + +#define ub_percpu_add(ub, field, v) do { \ + per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v); \ + put_cpu(); \ + } while (0) +#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1) + +#define ub_percpu_sub(ub, field, v) do { \ + per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v); \ + put_cpu(); \ + } while (0) +#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1) + +#define mm_ub(mm) ((mm)->mm_ub) +/* + * Charge/uncharge operations + */ + +extern int __charge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict); + +extern void __uncharge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val); + +extern void put_beancounter_safe(struct user_beancounter *ub); +extern void __put_beancounter(struct user_beancounter *ub); + +extern void 
uncharge_warn(struct user_beancounter *ub, int resource, + unsigned long val, unsigned long held); + +extern const char *ub_rnames[]; +/* + * Put a beancounter reference + */ + +static inline void put_beancounter(struct user_beancounter *ub) +{ + if (unlikely(ub == NULL)) + return; + + /* FIXME - optimize not to disable interrupts and make call */ + __put_beancounter(ub); +} + +/* fast put, refcount can't reach zero */ +static inline void __put_beancounter_batch(struct user_beancounter *ub, int n) +{ + atomic_sub(n, &ub->ub_refcount); +} + +static inline void put_beancounter_batch(struct user_beancounter *ub, int n) +{ + if (n > 1) + __put_beancounter_batch(ub, n - 1); + __put_beancounter(ub); +} + +/* + * Create a new beancounter reference + */ +extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create); + +static inline +struct user_beancounter *get_beancounter(struct user_beancounter *ub) +{ + if (unlikely(ub == NULL)) + return NULL; + + atomic_inc(&ub->ub_refcount); + return ub; +} + +static inline +struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub) +{ + return atomic_inc_not_zero(&ub->ub_refcount) ? 
ub : NULL; +} + +static inline void get_beancounter_batch(struct user_beancounter *ub, int n) +{ + atomic_add(n, &ub->ub_refcount); +} + +extern struct user_beancounter *get_subbeancounter_byid( + struct user_beancounter *, + int id, int create); + +extern void ub_init_late(void); +extern void ub_init_early(void); + +extern int print_ub_uid(struct user_beancounter *ub, char *buf, int size); + +/* + * Resource charging + * Change user's account and compare against limits + */ + +static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource) +{ + if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held) + ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held; + if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held) + ub->ub_parms[resource].minheld = ub->ub_parms[resource].held; +} + +int charge_beancounter(struct user_beancounter *ub, int resource, + unsigned long val, enum ub_severity strict); +void uncharge_beancounter(struct user_beancounter *ub, int resource, + unsigned long val); +void __charge_beancounter_notop(struct user_beancounter *ub, int resource, + unsigned long val); +void __uncharge_beancounter_notop(struct user_beancounter *ub, int resource, + unsigned long val); + +static inline void charge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + if (ub->parent != NULL) + __charge_beancounter_notop(ub, resource, val); +} + +static inline void uncharge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + if (ub->parent != NULL) + __uncharge_beancounter_notop(ub, resource, val); +} + +#endif /* CONFIG_BEANCOUNTERS */ + +#ifndef CONFIG_BC_RSS_ACCOUNTING +static inline void ub_init_pbc(void) { } /* no-op stub: RSS accounting compiled out */ +#else +extern void ub_init_pbc(void); +#endif +#endif /* __KERNEL__ */ +#endif /* _LINUX_BEANCOUNTER_H */ diff -urNp linux-2.6.32.48/include/bc/dcache.h linux-2.6.32.48-openvz/include/bc/dcache.h --- linux-2.6.32.48/include/bc/dcache.h 1969-12-31
19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/dcache.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,47 @@ +/* + * include/bc/dcache.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_DCACHE_H_ +#define __BC_DCACHE_H_ + +#include + +/* + * UB_DCACHESIZE accounting + */ + +struct dentry_beancounter +{ + /* + * d_inuse = + * + + * + * + * d_inuse == -1 means that dentry is unused + * state change -1 => 0 causes charge + * state change 0 => -1 causes uncharge + */ + atomic_t d_inuse; + /* charged size, including name length if name is not inline */ + unsigned long d_ubsize; + struct user_beancounter *d_ub; +}; + +#ifdef CONFIG_BEANCOUNTERS +#define ub_dget_testone(d) (atomic_inc_and_test(&(d)->dentry_bc.d_inuse)) +#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse)) +#define INUSE_INIT 0 + +extern int ub_dentry_on; +#else +#define ub_dget_testone(d) (0) +#define ub_dput_testzero(d) (0) +#endif +#endif diff -urNp linux-2.6.32.48/include/bc/dcache_op.h linux-2.6.32.48-openvz/include/bc/dcache_op.h --- linux-2.6.32.48/include/bc/dcache_op.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/dcache_op.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,102 @@ +/* + * include/bc/dcache_op.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __BC_DCACHE_OP_H_ +#define __BC_DCACHE_OP_H_ + +struct dentry; + +#ifdef CONFIG_BEANCOUNTERS + +#include +#include +#include + +extern int ub_dentry_alloc_barrier; +extern spinlock_t dcache_lock; + +static inline int ub_dentry_alloc(struct dentry *d) +{ + extern int __ub_dentry_alloc(struct dentry *); + + if (!ub_dentry_on) + return 0; + return __ub_dentry_alloc(d); +} + +static inline void ub_dentry_alloc_start(void) +{ + extern void __ub_dentry_alloc_start(void); + + if (ub_dentry_alloc_barrier) + __ub_dentry_alloc_start(); +} + +static inline void ub_dentry_alloc_end(void) +{ + extern void __ub_dentry_alloc_end(void); + + if (current->task_bc.dentry_alloc) + __ub_dentry_alloc_end(); +} + +static inline int ub_dentry_charge(struct dentry *d) +{ + extern int __ub_dentry_charge(struct dentry *); + + if (!ub_dentry_on) + return 0; + return __ub_dentry_charge(d); +} + +static inline void ub_dentry_charge_nofail(struct dentry *d) +{ + extern void __ub_dentry_charge_nofail(struct dentry *); + + if (!ub_dentry_on) + return; + __ub_dentry_charge_nofail(d); +} + +static inline void ub_dentry_uncharge_locked(struct dentry *d) +{ + extern void __ub_dentry_uncharge(struct dentry *); + + if (!ub_dentry_on) + return; + __ub_dentry_uncharge(d); +} + +static inline void ub_dentry_uncharge(struct dentry *d) +{ + extern void __ub_dentry_uncharge(struct dentry *); + + if (!ub_dentry_on) + return; + spin_lock(&dcache_lock); + __ub_dentry_uncharge(d); + spin_unlock(&dcache_lock); +} + +void uncharge_dcache(struct user_beancounter *ub, unsigned long size); +#else /* CONFIG_BEANCOUNTERS */ + +static inline int ub_dentry_alloc(struct dentry *d) { return 0; } +static inline void ub_dentry_alloc_start(void) { } +static inline void ub_dentry_alloc_end(void) { } +static inline int ub_dentry_charge(struct dentry *d) { return 0; } +static inline void ub_dentry_charge_nofail(struct dentry *d) { } +static inline void ub_dentry_uncharge_locked(struct dentry *d) { } +static 
inline void ub_dentry_uncharge(struct dentry *d) { } +static inline void uncharge_dcache(struct user_beancounter *ub, unsigned long size) { } + +#endif /* CONFIG_BEANCOUNTERS */ + +#endif /* __BC_DCACHE_OP_H_ */ diff -urNp linux-2.6.32.48/include/bc/debug.h linux-2.6.32.48-openvz/include/bc/debug.h --- linux-2.6.32.48/include/bc/debug.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/debug.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,103 @@ +/* + * include/bc/debug.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_DEBUG_H_ +#define __BC_DEBUG_H_ + +/* + * general debugging + */ + +#define UBD_ALLOC 0x1 +#define UBD_CHARGE 0x2 +#define UBD_LIMIT 0x4 +#define UBD_TRACE 0x8 + +/* + * ub_net debugging + */ + +#define UBD_NET_SOCKET 0x10 +#define UBD_NET_SLEEP 0x20 +#define UBD_NET_SEND 0x40 +#define UBD_NET_RECV 0x80 + +/* + * Main routines + */ + +#define UB_DEBUG (0) +#define DEBUG_RESOURCE (0ULL) + +#define ub_dbg_cond(__cond, __str, args...) \ + do { \ + if ((__cond) != 0) \ + printk(__str, ##args); \ + } while(0) + +#define ub_debug(__section, __str, args...) \ + ub_dbg_cond(UB_DEBUG & (__section), __str, ##args) + +#define ub_debug_resource(__resource, __str, args...)
\ + ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && \ + (DEBUG_RESOURCE & (1 << (__resource))), \ + __str, ##args) + +#if UB_DEBUG & UBD_TRACE +#define ub_debug_trace(__cond, __b, __r) \ + do { \ + static struct ub_rate_info ri = { __b, __r }; \ + if ((__cond) != 0 && ub_ratelimit(&ri)) \ + dump_stack(); \ + } while(0) +#else +#define ub_debug_trace(__cond, __burst, __rate) +#endif + +#ifdef CONFIG_BC_DEBUG_KMEM +#include + +struct user_beancounter; +struct ub_cache_counter { + struct list_head ulist; + struct ub_cache_counter *next; + struct user_beancounter *ub; + struct kmem_cache *cachep; + unsigned long counter; +}; + +extern spinlock_t cc_lock; +extern void init_cache_counters(void); +extern void ub_free_counters(struct user_beancounter *); +extern void ub_kmemcache_free(struct kmem_cache *cachep); + +struct vm_struct; +#define inc_vmalloc_charged(vm, flags) do { \ + if (flags & __GFP_UBC) \ + ub_percpu_add(get_exec_ub(), vmalloc_charged, \ + vm->nr_pages); \ + } while (0) +#define dec_vmalloc_charged(vm) do { \ + struct user_beancounter *ub; \ + ub = page_ub(vm->pages[0]); \ + if (ub != NULL) \ + ub_percpu_sub(ub, vmalloc_charged, \ + vm->nr_pages); \ + } while (0) +#else +#define init_cache_counters() do { } while (0) +#define inc_vmalloc_charged(vm, f) do { } while (0) +#define dec_vmalloc_charged(vm) do { } while (0) + +#define ub_free_counters(ub) do { } while (0) +#define ub_kmemcache_free(cachep) do { } while (0) +#endif + +#endif diff -urNp linux-2.6.32.48/include/bc/decl.h linux-2.6.32.48-openvz/include/bc/decl.h --- linux-2.6.32.48/include/bc/decl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/decl.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,41 @@ +/* + * include/bc/decl.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __BC_DECL_H_ +#define __BC_DECL_H_ + +#ifdef __KERNEL__ + +/* + * Naming convention: + * ub__ + */ + +#ifdef CONFIG_BEANCOUNTERS + +#define UB_DECLARE_FUNC(ret_type, decl) extern ret_type decl; +#define UB_DECLARE_VOID_FUNC(decl) extern void decl; + +#else /* CONFIG_BEANCOUNTERS */ + +#define UB_DECLARE_FUNC(ret_type, decl) \ + static inline ret_type decl \ + { \ + return (ret_type)0; \ + } +#define UB_DECLARE_VOID_FUNC(decl) \ + static inline void decl \ + { \ + } + +#endif /* CONFIG_BEANCOUNTERS */ +#endif + +#endif diff -urNp linux-2.6.32.48/include/bc/hash.h linux-2.6.32.48-openvz/include/bc/hash.h --- linux-2.6.32.48/include/bc/hash.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/hash.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,36 @@ +/* + * include/bc/hash.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_UBHASH_H +#define _LINUX_UBHASH_H + +#ifdef __KERNEL__ + +#define UB_HASH_SIZE 256 + +extern struct hlist_head ub_hash[]; +extern spinlock_t ub_hash_lock; +extern struct list_head ub_list_head; + +#ifdef CONFIG_BEANCOUNTERS + +/* + * Iterate over beancounters + * @__ubp - beancounter ptr + * Can use break :) + */ +#define for_each_beancounter(__ubp) \ + list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list) + +#define bc_hash_entry(ptr) hlist_entry(ptr, struct user_beancounter, ub_hash) + +#endif /* CONFIG_BEANCOUNTERS */ +#endif /* __KERNEL__ */ +#endif /* _LINUX_UBHASH_H */ diff -urNp linux-2.6.32.48/include/bc/io_acct.h linux-2.6.32.48-openvz/include/bc/io_acct.h --- linux-2.6.32.48/include/bc/io_acct.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/io_acct.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,113 @@ +/* + * include/bc/io_acct.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file.
+ * + * Pavel Emelianov + * + */ + +#ifndef __UB_IO_ACCT_H_ +#define __UB_IO_ACCT_H_ + +#define PAGE_IO_MARK (0x1UL) + +#ifdef CONFIG_BC_IO_ACCOUNTING +#include +#include + +#define page_iopb(page) ({ \ + struct page_beancounter *pb; \ + pb = page_pbc(page); \ + rmb(); \ + pb; \ + }) + +/* + * IO ub is required in task context only, so if exec_ub is set + * to NULL this means that the user doesn't need to charge some + * resources. Nevertheless, IO activity must be accounted, so we + * account it to the current task's beancounter. + */ + +static inline struct user_beancounter *get_io_ub(void) +{ + struct user_beancounter *ub; + + ub = get_exec_ub(); + if (unlikely(ub == NULL)) + ub = get_task_ub(current); + + return top_beancounter(ub); +} + +extern struct page_beancounter **page_pblist(struct page *); + +extern void ub_io_save_context(struct page *, size_t); +extern void ub_io_release_context(struct page *pg, size_t size); + +static inline struct page_beancounter *iopb_to_pb(struct page_beancounter *pb) +{ + if (!((unsigned long)pb & PAGE_IO_MARK)) + return NULL; + + return (struct page_beancounter *)((unsigned long)pb & ~PAGE_IO_MARK); +} + +static inline void ub_io_account_read(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_read, bytes); +} + +static inline void ub_io_account_write(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_wrote, bytes); +} + +static inline void ub_io_account_dirty(struct page *page, size_t bytes) +{ + ub_io_save_context(page, bytes); +} + +static inline void ub_io_account_write_cancelled(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_cancelled, bytes); +} + +void ub_init_io(struct kmem_cache *); +#else /* BC_IO_ACCOUNTING */ +#define page_iopb(page) (NULL) +#define page_pblist(page) (&page_pbc(page)) + +static inline void ub_io_release_context(struct page *pg, size_t bytes) +{ +} + +static inline void ub_io_account_dirty(struct page *p, size_t bytes) +{ +} + +static inline void ub_io_account_read(size_t bytes) +{ +} + +static inline
void ub_io_account_write(size_t bytes) +{ +} + +static inline void ub_io_account_write_cancelled(size_t bytes) +{ +} + +static inline void ub_init_io(struct kmem_cache *pb_cachep) { }; +#endif + +#ifdef CONFIG_BC_DEBUG_IO +extern void ub_io_release_debug(struct page *pg); +#else +#define ub_io_release_debug(pg) do { } while (0) +#endif +#endif diff -urNp linux-2.6.32.48/include/bc/kmem.h linux-2.6.32.48-openvz/include/bc/kmem.h --- linux-2.6.32.48/include/bc/kmem.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/kmem.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,69 @@ +/* + * include/bc/kmem.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_SLAB_H_ +#define __UB_SLAB_H_ + +#include +#include + +/* + * UB_KMEMSIZE accounting + */ + +#ifdef CONFIG_BC_DEBUG_ITEMS +#define CHARGE_ORDER(__o) (1 << (__o)) +#define CHARGE_SIZE(__s) 1 +#else +#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o)) +#define CHARGE_SIZE(__s) (__s) +#endif + +#ifdef CONFIG_BEANCOUNTERS +#define page_ub(__page) ((__page)->bc.page_ub) +#else +#define page_ub(__page) NULL +#endif + +struct mm_struct; +struct page; +struct kmem_cache; + +UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj)) +UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj)) + +UB_DECLARE_FUNC(int, ub_kmemsize_charge(struct user_beancounter *ub, + unsigned long size, enum ub_severity strict)) +UB_DECLARE_VOID_FUNC(ub_kmemsize_uncharge(struct user_beancounter *ub, + unsigned long size)) + +UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, gfp_t mask)) +UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order)) +UB_DECLARE_FUNC(int, ub_slab_charge(struct kmem_cache *cachep, + void *objp, gfp_t flags)) +UB_DECLARE_VOID_FUNC(ub_slab_uncharge(struct kmem_cache *cachep, void *obj)) + +#ifdef CONFIG_BEANCOUNTERS +static inline int should_charge(unsigned long cflags, gfp_t 
flags) +{ + if (!(cflags & SLAB_UBC)) + return 0; + if ((cflags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC)) + return 0; + return 1; +} + +#define should_uncharge(cflags) should_charge(cflags, __GFP_UBC) +#else +#define should_charge(cflags, f) 0 +#define should_uncharge(cflags) 0 +#endif + +#endif /* __UB_SLAB_H_ */ diff -urNp linux-2.6.32.48/include/bc/misc.h linux-2.6.32.48-openvz/include/bc/misc.h --- linux-2.6.32.48/include/bc/misc.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/misc.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,55 @@ +/* + * include/bc/misc.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_MISC_H_ +#define __BC_MISC_H_ + +#include + +struct tty_struct; +struct file; +struct file_lock; +struct sigqueue; + +UB_DECLARE_FUNC(int, ub_file_charge(struct file *f)) +UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f)) +UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard)) +UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl)) +UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q, + struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q)) +UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent, + struct task_struct *task)) +UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task)) +UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task)) +UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty)) +UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty)) + +#ifdef CONFIG_BEANCOUNTERS +#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0) +#define unset_flock_charged(fl) do { \ + WARN_ON((fl)->fl_charged == 0); \ + (fl)->fl_charged = 0; \ + } while (0) +#define set_mm_ub(mm, tsk) do { \ + (mm)->mm_ub = get_beancounter(tsk != current ? 
\ + tsk->task_bc.task_ub : get_exec_ub()); \ + } while (0) +#define put_mm_ub(mm) do { \ + put_beancounter((mm)->mm_ub); \ + (mm)->mm_ub = NULL; \ + } while (0) +#else +#define set_flock_charged(fl) do { } while (0) +#define unset_flock_charged(fl) do { } while (0) +#define set_mm_ub(mm, tsk) do { } while (0) +#define put_mm_ub(mm) do { } while (0) +#endif +#endif diff -urNp linux-2.6.32.48/include/bc/net.h linux-2.6.32.48-openvz/include/bc/net.h --- linux-2.6.32.48/include/bc/net.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/net.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,213 @@ +/* + * include/bc/net.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_NET_H_ +#define __BC_NET_H_ + +/* + * UB_NUMXXXSOCK, UB_XXXBUF accounting + */ + +#include +#include +#include + +#define bid2sid(__bufid) \ + ((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK) + +#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \ + ~(SMP_CACHE_BYTES-1))) +#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE) + +static inline int ub_skb_alloc_bc(struct sk_buff *skb, gfp_t gfp_mask) +{ +#ifdef CONFIG_BEANCOUNTERS + memset(skb_bc(skb), 0, sizeof(struct skb_beancounter)); +#endif + return 0; +} + +static inline void ub_skb_free_bc(struct sk_buff *skb) +{ +} + +#define IS_TCP_SOCK(__family, __type) \ + (((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM) + +/* number of sockets */ +UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type)) +UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk)) +UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk)) +UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk)) + +/* management of queue for send space */ +UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, + unsigned long size)) +UB_DECLARE_FUNC(int, 
ub_sock_snd_queue_add(struct sock *sk, int resource, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk)) + +/* send space */ +UB_DECLARE_FUNC(int, ub_sock_make_wreserv(struct sock *sk, int bufid, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_sock_get_wreserv(struct sock *sk, int bufid, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargesend(struct sock *sk, + struct sk_buff *skb, enum ub_severity strict)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk)) +UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk)) + +UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)) + +/* receive space */ +UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargerecv(struct sock *sk, + struct sk_buff *skb, enum ub_severity strict)) + +/* skb destructor */ +UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb)) + +static inline int ub_sock_makewres_other(struct sock *sk, unsigned long size) +{ + return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size); +} + +static inline int ub_sock_makewres_tcp(struct sock *sk, unsigned long size) +{ + return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size); +} + +UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk, + unsigned long size)) + +static inline int ub_sock_getwres_tcp(struct sock *sk, unsigned long size) +{ + return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size); +} + +UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk, + unsigned long size, unsigned long ressize)) + +static inline void ub_sock_retwres_tcp(struct sock *sk, unsigned long size, + unsigned long ressize) +{ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize); +} + +static inline int ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz) +{ + return ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, 
sz); +} + +static inline int ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz) +{ + return ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz); +} + +static inline int ub_tcpsndbuf_charge(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargesend(sk, skb, UB_HARD); +} + +static inline int ub_tcpsndbuf_charge_forced(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargesend(sk, skb, UB_FORCE); +} + +static inline int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + return ub_sock_tcp_chargerecv(sk, skb, UB_SOFT); +} + +static inline int ub_tcprcvbuf_charge_forced(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargerecv(sk, skb, UB_FORCE); +} + +/* Charge size */ +static inline unsigned long skb_charge_datalen(unsigned long chargesize) +{ +#ifdef CONFIG_BEANCOUNTERS + unsigned long slabsize; + + chargesize -= sizeof(struct sk_buff); + slabsize = 64; + do { + slabsize <<= 1; + } while (slabsize <= chargesize); + + slabsize >>= 1; + return (slabsize - sizeof(struct skb_shared_info)) & + ~(SMP_CACHE_BYTES-1); +#else + return 0; +#endif +} + +static inline unsigned long skb_charge_size_gen(unsigned long size) +{ +#ifdef CONFIG_BEANCOUNTERS + unsigned int slabsize; + + size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info); + slabsize = 32; /* min size is 64 because of skb_shared_info */ + do { + slabsize <<= 1; + } while (slabsize < size); + + return slabsize + sizeof(struct sk_buff); +#else + return 0; +#endif + +} + +static inline unsigned long skb_charge_size_const(unsigned long size) +{ +#ifdef CONFIG_BEANCOUNTERS + unsigned int ret; + if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64) + ret = 64 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128) + ret = 128 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256) + ret = 256 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + 
sizeof(struct skb_shared_info) <= 512) + ret = 512 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024) + ret = 1024 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048) + ret = 2048 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096) + ret = 4096 + sizeof(struct sk_buff); + else + ret = skb_charge_size_gen(size); + return ret; +#else + return 0; +#endif +} + + +#define skb_charge_size(__size) \ + (__builtin_constant_p(__size) ? \ + skb_charge_size_const(__size) : \ + skb_charge_size_gen(__size)) + +UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb)) +UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, + struct sock *sk, unsigned long size, int res)) + +#endif diff -urNp linux-2.6.32.48/include/bc/oom_kill.h linux-2.6.32.48-openvz/include/bc/oom_kill.h --- linux-2.6.32.48/include/bc/oom_kill.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/oom_kill.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,26 @@ +#include +#include + +UB_DECLARE_FUNC(int, ub_oom_lock(void)) +UB_DECLARE_FUNC(struct user_beancounter *, ub_oom_select_worst(void)) +UB_DECLARE_VOID_FUNC(ub_oom_mm_killed(struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_oom_unlock(void)) +UB_DECLARE_VOID_FUNC(ub_out_of_memory(struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_oom_task_dead(struct task_struct *tsk)) +UB_DECLARE_FUNC(int, ub_oom_task_skip(struct user_beancounter *ub, + struct task_struct *tsk)) + +#ifdef CONFIG_BEANCOUNTERS +extern int oom_generation; +extern int oom_kill_counter; +#define ub_oom_start() do { \ + current->task_bc.oom_generation = oom_generation; \ + } while (0) +#define ub_oom_task_killed(p) do { \ + oom_kill_counter++; \ + wake_up_process(p); \ + } while (0) +#else +#define ub_oom_start() do { } while (0) +#define ub_oom_task_killed(p) do { } while (0) +#endif diff -urNp 
linux-2.6.32.48/include/bc/proc.h linux-2.6.32.48-openvz/include/bc/proc.h --- linux-2.6.32.48/include/bc/proc.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/proc.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,40 @@ +/* + * include/bc/proc.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_PROC_H_ +#define __UB_PROC_H_ + +#include + +struct bc_proc_entry { + char *name; + union { + int (*show)(struct seq_file *, void *); + struct file_operations *fops; + } u; + struct bc_proc_entry *next; + int cookie; +}; + +struct user_beancounter; + +void bc_register_proc_entry(struct bc_proc_entry *); +void bc_register_proc_root_entry(struct bc_proc_entry *); + +static inline struct user_beancounter *seq_beancounter(struct seq_file *f) +{ + return (struct user_beancounter *)(f->private); +} + +extern const char *bc_proc_lu_fmt; +extern const char *bc_proc_lu_lfmt; +extern const char *bc_proc_llu_fmt; +extern const char *bc_proc_lu_lu_fmt; +#endif diff -urNp linux-2.6.32.48/include/bc/rss_pages.h linux-2.6.32.48-openvz/include/bc/rss_pages.h --- linux-2.6.32.48/include/bc/rss_pages.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/rss_pages.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,57 @@ +/* + * include/bc/rss_pages.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __RSS_PAGES_H_ +#define __RSS_PAGES_H_ + +/* + * Page_beancounters + */ + +struct page; +struct user_beancounter; + +#define PB_MAGIC 0x62700001UL + +struct page_beancounter { + unsigned long pb_magic; + struct page *page; + struct user_beancounter *ub; + union { + struct page_beancounter *next_hash; + struct page_beancounter *page_pb_list; + }; + union { + unsigned refcount; + unsigned io_debug; + }; + union { + struct list_head page_list; + struct list_head io_list; + }; +}; + +#define PB_REFCOUNT_BITS 24 +#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS) +#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS)) +#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS)) +#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1)) +#define PB_COUNT_INC(c) ((c)++) +#define PB_COUNT_DEC(c) ((c)--) +#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c)) + +#define page_pbc(__page) ((__page)->bc.page_pb) + +extern spinlock_t pb_lock; + +struct address_space; +extern int is_shmem_mapping(struct address_space *); + +#endif diff -urNp linux-2.6.32.48/include/bc/sock.h linux-2.6.32.48-openvz/include/bc/sock.h --- linux-2.6.32.48/include/bc/sock.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/sock.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,47 @@ +/* + * include/bc/sock.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_SOCK_H_ +#define __BC_SOCK_H_ + +#include + +struct sock; +struct sk_buff; + +struct skb_beancounter { + struct user_beancounter *ub; + unsigned long charged:27, resource:5; +}; + +struct sock_beancounter { + struct user_beancounter *ub; + /* + * poll_reserv accounts space already charged for future sends. + * It is required to make poll agree with sendmsg. + * Additionally, it makes real charges (with taking bc spinlock) + * in the send path rarer, speeding networking up. 
+ * For TCP (only): changes are protected by socket lock (not bc!) + * For all proto: may be read without serialization in poll. + */ + unsigned long poll_reserv; + unsigned long forw_space; + /* fields below are protected by bc spinlock */ + unsigned long ub_waitspc; /* space waiting for */ + unsigned long ub_wcharged; + struct list_head ub_sock_list; +}; + +#define sock_bc(__sk) (&(__sk)->sk_bc) +#define skb_bc(__skb) (&(__skb)->skb_bc) +#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc)) +#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL) + +#endif diff -urNp linux-2.6.32.48/include/bc/sock_orphan.h linux-2.6.32.48-openvz/include/bc/sock_orphan.h --- linux-2.6.32.48/include/bc/sock_orphan.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/sock_orphan.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,98 @@ +/* + * include/bc/sock_orphan.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __BC_SOCK_ORPHAN_H_ +#define __BC_SOCK_ORPHAN_H_ + +#include + +#include "bc/beancounter.h" +#include "bc/net.h" + + +static inline struct percpu_counter *__ub_get_orphan_count_ptr(struct sock *sk) +{ + if (sock_has_ubc(sk)) { + struct user_beancounter *ub; + + ub = top_beancounter(sock_bc(sk)->ub); + return &ub->ub_orphan_count; + } else + return sk->sk_prot->orphan_count; +} + +static inline void ub_inc_orphan_count(struct sock *sk) +{ + percpu_counter_inc(__ub_get_orphan_count_ptr(sk)); +} + +static inline void ub_dec_orphan_count(struct sock *sk) +{ + percpu_counter_dec(__ub_get_orphan_count_ptr(sk)); +} + +static inline int ub_get_orphan_count(struct sock *sk) +{ + return percpu_counter_sum_positive(__ub_get_orphan_count_ptr(sk)); +} + +extern int ub_too_many_orphans(struct sock *sk, int count); + +#include + +struct inet_timewait_sock; + +static inline void ub_timewait_mod(struct inet_timewait_sock *tw, int incdec) +{ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; + + ub = slab_ub(tw); + if (ub != NULL) + ub->ub_tw_count += incdec; +#endif +} + +static inline int __ub_timewait_check(struct sock *sk) +{ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; + unsigned long mem_max, mem; + int tw_count; + + ub = sock_bc(sk)->ub; + if (ub == NULL) + return 1; + + tw_count = ub->ub_tw_count; + mem_max = sysctl_tcp_max_tw_kmem_fraction * + ((ub->ub_parms[UB_KMEMSIZE].limit >> 10) + 1); + mem = kmem_cache_objuse(sk->sk_prot_creator->twsk_prot->twsk_slab); + mem *= tw_count; + return tw_count < sysctl_tcp_max_tw_buckets_ub && mem < mem_max; +#else + return 1; +#endif +} + +#define ub_timewait_inc(tw, twdr) do { \ + if ((twdr)->ub_managed) \ + ub_timewait_mod(tw, 1); \ + } while (0) + +#define ub_timewait_dec(tw, twdr) do { \ + if ((twdr)->ub_managed) \ + ub_timewait_mod(tw, -1); \ + } while (0) + +#define ub_timewait_check(sk, twdr) ((!(twdr)->ub_managed) || \ + __ub_timewait_check(sk)) + +#endif diff -urNp 
linux-2.6.32.48/include/bc/statd.h linux-2.6.32.48-openvz/include/bc/statd.h --- linux-2.6.32.48/include/bc/statd.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/statd.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,70 @@ +/* + * include/bc/statd.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_STATD_H_ +#define __BC_STATD_H_ + +/* sys_ubstat commands list */ +#define UBSTAT_READ_ONE 0x010000 +#define UBSTAT_READ_ALL 0x020000 +#define UBSTAT_READ_FULL 0x030000 +#define UBSTAT_UBLIST 0x040000 +#define UBSTAT_UBPARMNUM 0x050000 +#define UBSTAT_GETTIME 0x060000 + +#define UBSTAT_CMD(func) ((func) & 0xF0000) +#define UBSTAT_PARMID(func) ((func) & 0x0FFFF) + +#define TIME_MAX_SEC (LONG_MAX / HZ) +#define TIME_MAX_JIF (TIME_MAX_SEC * HZ) + +typedef unsigned long ubstattime_t; + +typedef struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstattime_t cur_time; +} ubgettime_t; + +typedef struct { + long maxinterval; + int signum; +} ubnotifrq_t; + +typedef struct { + unsigned long maxheld; + unsigned long failcnt; +} ubstatparm_t; + +typedef struct { + unsigned long barrier; + unsigned long limit; + unsigned long held; + unsigned long maxheld; + unsigned long minheld; + unsigned long failcnt; + unsigned long __unused1; + unsigned long __unused2; +} ubstatparmf_t; + +typedef struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparmf_t param[0]; +} ubstatfull_t; + +#ifdef __KERNEL__ +struct ub_stat_notify { + struct list_head list; + struct task_struct *task; + int signum; +}; +#endif +#endif diff -urNp linux-2.6.32.48/include/bc/task.h linux-2.6.32.48-openvz/include/bc/task.h --- linux-2.6.32.48/include/bc/task.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/task.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,69 @@ +/* + * include/bc/task.h + * + * Copyright (C) 2005 SWsoft + * All rights 
reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_TASK_H_ +#define __BC_TASK_H_ + +struct user_beancounter; + + +#ifdef CONFIG_BEANCOUNTERS +struct task_beancounter { + struct user_beancounter *exec_ub; + struct user_beancounter *saved_ub; + struct user_beancounter *task_ub; + struct user_beancounter *fork_sub; + unsigned long file_precharged, file_quant, file_count; + unsigned long kmem_precharged; + char dentry_alloc, pgfault_handle; + void *task_fnode, *task_freserv; + unsigned long oom_generation; + unsigned long task_data[4]; + unsigned long pgfault_allot; +}; + +#define get_task_ub(__task) ((__task)->task_bc.task_ub) + +extern struct user_beancounter ub0; +#define get_ub0() (&ub0) +/* Switch t to ub0, keeping the old exec_ub in saved_ub; restore undoes it. */ +#define ub_save_context(t) do { \ + (t)->task_bc.saved_ub = (t)->task_bc.exec_ub; \ + (t)->task_bc.exec_ub = get_ub0(); \ + } while (0) +#define ub_restore_context(t) do { \ + (t)->task_bc.exec_ub = (t)->task_bc.saved_ub; \ + } while (0) +/* set_exec_ub() evaluates to the exec_ub of current that it replaced. */ +#define get_exec_ub() (current->task_bc.exec_ub) +#define set_exec_ub(__newub) \ +({ \ + struct user_beancounter *old; \ + struct task_beancounter *tbc; \ + \ + tbc = &current->task_bc; \ + old = tbc->exec_ub; \ + tbc->exec_ub = (__newub); \ + old; \ +}) + +void ub_init_task_bc(struct task_beancounter *); + +#else /* CONFIG_BEANCOUNTERS */ + +#define get_ub0() (NULL) +#define get_exec_ub() (NULL) +#define get_task_ub(task) (NULL) +#define set_exec_ub(__ub) (NULL) +#define ub_save_context(t) do { } while (0) +#define ub_restore_context(t) do { } while (0) + +#endif /* CONFIG_BEANCOUNTERS */ +#endif /* __BC_TASK_H_ */ diff -urNp linux-2.6.32.48/include/bc/tcp.h linux-2.6.32.48-openvz/include/bc/tcp.h --- linux-2.6.32.48/include/bc/tcp.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/tcp.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,76 @@ +/* + * include/bc/tcp.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __BC_TCP_H_ +#define __BC_TCP_H_ + +/* + * UB_NUMXXXSOCK, UB_XXXBUF accounting + */ + +#include +#include + +static inline void ub_tcp_update_maxadvmss(struct sock *sk) +{ +#ifdef CONFIG_BEANCOUNTERS + if (!sock_has_ubc(sk)) + return; + if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss) + return; + + sock_bc(sk)->ub->ub_maxadvmss = + skb_charge_size(MAX_HEADER + sizeof(struct iphdr) + + sizeof(struct tcphdr) + tcp_sk(sk)->advmss); +#endif +} + +static inline int ub_tcp_rmem_allows_expand(struct sock *sk) +{ + if (tcp_memory_pressure) + return 0; +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) { + struct user_beancounter *ub; + + ub = sock_bc(sk)->ub; + if (ub->ub_rmem_pressure == UB_RMEM_EXPAND) + return 1; + if (ub->ub_rmem_pressure == UB_RMEM_SHRINK) + return 0; + return sk->sk_rcvbuf <= ub->ub_rmem_thres; + } +#endif + return 1; +} + +static inline int ub_tcp_memory_pressure(struct sock *sk) +{ + if (tcp_memory_pressure) + return 1; +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND; +#endif + return 0; +} + +static inline int ub_tcp_shrink_rcvbuf(struct sock *sk) +{ + if (tcp_memory_pressure) + return 1; +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK; +#endif + return 0; +} + +#endif diff -urNp linux-2.6.32.48/include/bc/vmpages.h linux-2.6.32.48-openvz/include/bc/vmpages.h --- linux-2.6.32.48/include/bc/vmpages.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/bc/vmpages.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,152 @@ +/* + * include/bc/vmpages.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_PAGES_H_ +#define __UB_PAGES_H_ + +#include +#include +#include + +/* + * Check whether vma has private or copy-on-write mapping. + * Should match checks in ub_protected_charge(). 
+ */ +#define VM_UB_PRIVATE(__flags, __file) \ + ( ((__flags) & VM_WRITE) ? \ + (__file) == NULL || !((__flags) & VM_SHARED) : \ + 0 \ + ) + +/* Mprotect charging result */ +#define PRIVVM_ERROR -1 +#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */ +#define PRIVVM_TO_PRIVATE 1 +#define PRIVVM_TO_SHARED 2 + +UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm, + unsigned long size, + unsigned long newflags, + struct vm_area_struct *vma)) + +UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long num)) +#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1) +UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long num)) +#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1) + +UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm, + long sz)) + +UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm, + unsigned long size, + unsigned vm_flags, + struct file *vm_file, + int strict)) +UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm, + unsigned long size, + unsigned vm_flags, + struct file *vm_file)) + +struct shmem_inode_info; +UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i, + unsigned long sz)) +UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i, + unsigned long sz)) +UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi)) +UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi, + unsigned long size)) +#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1) + +#ifdef CONFIG_BEANCOUNTERS +#define shmi_ub_set(shi, ub) do { \ + (shi)->shmi_ub = get_beancounter(ub); \ + } while (0) +#define shmi_ub_put(shi) do { \ + put_beancounter((shi)->shmi_ub); \ + (shi)->shmi_ub = NULL; \ + } while (0) +#else +#define shmi_ub_set(shi, ub) do { } while (0) +#define shmi_ub_put(shi) do { } while (0) +#endif + 
+UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi, + unsigned long size)) + +UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end)) +#define pages_in_vma(vma) (pages_in_vma_range(vma, \ + vma->vm_start, vma->vm_end)) + +#define UB_PAGE_WEIGHT_SHIFT 24 +#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT) + +struct page_beancounter; +#define PBC_COPY_SAME ((struct page_beancounter *) 1) + +/* Mprotect charging result */ +#define PRIVVM_ERROR -1 +#define PRIVVM_NO_CHARGE 0 +#define PRIVVM_TO_PRIVATE 1 +#define PRIVVM_TO_SHARED 2 + +extern void __ub_update_physpages(struct user_beancounter *ub); +extern void __ub_update_oomguarpages(struct user_beancounter *ub); +extern void __ub_update_privvm(struct user_beancounter *ub); + +#ifdef CONFIG_BC_RSS_ACCOUNTING +#define PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) +#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) +#else +#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} +#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { } +#endif + +PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc)) +PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num)) +PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc)) +PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page, + struct mm_struct *mm, + struct page_beancounter **pbc)) +PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page, + struct mm_struct *mm, + struct page_beancounter **pbc)) +PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb)) +PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb)) +PB_DECLARE_VOID_FUNC(pb_remove_ref(struct 
page *page, + struct mm_struct *mm)) + +PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page)) + +/* Swap accounting hooks; kept inside the include guard closed below. */ +#ifdef CONFIG_BC_SWAP_ACCOUNTING +#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) +#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) +#else +#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} +#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { } +#endif + +struct swap_info_struct; +SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n)) +SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si)) +SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n, + struct user_beancounter *ub)) +SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n)) +#endif /* __UB_PAGES_H_ */ diff -urNp linux-2.6.32.48/include/linux/aio.h linux-2.6.32.48-openvz/include/linux/aio.h --- linux-2.6.32.48/include/linux/aio.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/aio.h 2011-11-21 17:40:45.000000000 -0500 @@ -234,4 +234,8 @@ static inline struct kiocb *list_kiocb(s extern unsigned long aio_nr; extern unsigned long aio_max_nr; +void wait_for_all_aios(struct kioctx *ctx); +extern struct kmem_cache *kioctx_cachep; +extern void aio_kick_handler(struct work_struct *); + #endif /* __LINUX__AIO_H */ diff -urNp linux-2.6.32.48/include/linux/capability.h linux-2.6.32.48-openvz/include/linux/capability.h --- linux-2.6.32.48/include/linux/capability.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/capability.h 2011-11-21 17:40:45.000000000 -0500 @@ -197,12 +197,9 @@ struct cpu_vfs_cap_data { #define CAP_NET_BROADCAST 11 -/* Allow interface configuration */ /* Allow administration of IP firewall, masquerading and accounting */ /* Allow setting debug option on sockets */ /* Allow modification of routing tables */ -/* Allow setting arbitrary process / process group ownership on - sockets */ /* Allow binding to 
any address for transparent proxying */ /* Allow setting TOS (type of service) */ /* Allow setting promiscuous mode */ @@ -232,6 +229,7 @@ struct cpu_vfs_cap_data { #define CAP_SYS_MODULE 16 /* Allow ioperm/iopl access */ +/* Allow O_DIRECT access */ /* Allow sending USB messages to any device via /proc/bus/usb */ #define CAP_SYS_RAWIO 17 @@ -250,24 +248,19 @@ struct cpu_vfs_cap_data { /* Allow configuration of the secure attention key */ /* Allow administration of the random device */ -/* Allow examination and configuration of disk quotas */ /* Allow configuring the kernel's syslog (printk behaviour) */ /* Allow setting the domainname */ /* Allow setting the hostname */ /* Allow calling bdflush() */ -/* Allow mount() and umount(), setting up new smb connection */ +/* Allow setting up new smb connection */ /* Allow some autofs root ioctls */ /* Allow nfsservctl */ /* Allow VM86_REQUEST_IRQ */ /* Allow to read/write pci config on alpha */ /* Allow irix_prctl on mips (setstacksize) */ /* Allow flushing all cache on m68k (sys_cacheflush) */ -/* Allow removing semaphores */ -/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores - and shared memory */ /* Allow locking/unlocking of shared memory segment */ /* Allow turning swap on/off */ -/* Allow forged pids on socket credentials passing */ /* Allow setting readahead and flushing buffers on block devices */ /* Allow setting geometry in floppy driver */ /* Allow turning DMA on/off in xd driver */ @@ -340,6 +333,61 @@ struct cpu_vfs_cap_data { #define CAP_SETFCAP 31 +#ifdef __KERNEL__ +/* + * Important note: VZ capabilities do intersect with CAP_AUDIT + * this is due to compatibility reasons. Nothing bad. + * Both VZ and Audit/SELinux caps are disabled in VPSs. + */ + +/* Allow access to all information. 
In the other case some structures will be + * hidden so that different Virtual Environments do not interact on the same + * node (NOW OBSOLETED) + */ +#define CAP_SETVEID 29 + +#define capable_setveid() ({ \ + ve_is_super(get_exec_env()) && \ + (capable(CAP_SYS_ADMIN) || \ + capable(CAP_VE_ADMIN)); \ + }) + +/* + * coincides with CAP_AUDIT_CONTROL but we don't care, since + * audit is disabled in Virtuozzo + */ +#define CAP_VE_ADMIN 30 + +#ifdef CONFIG_VE + +/* Replacement for CAP_NET_ADMIN: + delegated rights to the Virtual environment of its network administration. + For now the following rights have been delegated: + + Allow setting arbitrary process / process group ownership on sockets + Allow interface configuration + */ +#define CAP_VE_NET_ADMIN CAP_VE_ADMIN + +/* Replacement for CAP_SYS_ADMIN: + delegated rights to the Virtual environment of its administration. + For now the following rights have been delegated: + */ +/* Allow mount/umount/remount */ +/* Allow examination and configuration of disk quotas */ +/* Allow removing semaphores */ +/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores + and shared memory */ +/* Allow locking/unlocking of shared memory segment */ +/* Allow forged pids on socket credentials passing */ + +#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN +#else +#define CAP_VE_NET_ADMIN CAP_NET_ADMIN +#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN +#endif +#endif + /* Override MAC access. The base kernel enforces no MAC policy. 
An LSM may enforce a MAC policy, and if it does and it chooses @@ -418,7 +466,16 @@ struct cpu_vfs_cap_data { #define CAP_INIT_INH_SET CAP_EMPTY_SET # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) +#ifndef CONFIG_VE # define cap_set_full(c) do { (c) = __cap_full_set; } while (0) +#else +# define cap_set_full(c) do { \ + if (ve_is_super(get_exec_env())) \ + (c) = __cap_full_set; \ + else \ + (c) = get_exec_env()->ve_cap_bset;\ + } while (0) +#endif # define cap_set_init_eff(c) do { (c) = __cap_init_eff_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) @@ -536,6 +593,10 @@ extern const kernel_cap_t __cap_empty_se extern const kernel_cap_t __cap_full_set; extern const kernel_cap_t __cap_init_eff_set; +#include + +extern spinlock_t task_capability_lock; + /** * has_capability - Determine if a task has a superior capability available * @t: The task in question diff -urNp linux-2.6.32.48/include/linux/cgroup.h linux-2.6.32.48-openvz/include/linux/cgroup.h --- linux-2.6.32.48/include/linux/cgroup.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/cgroup.h 2011-11-21 17:40:45.000000000 -0500 @@ -220,6 +220,8 @@ struct cgroup { /* For RCU-protected deletion */ struct rcu_head rcu_head; + + int cgroup_lite_id; }; /* @@ -525,6 +527,7 @@ struct task_struct *cgroup_iter_next(str void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); +int cgroup_set_task_css(struct task_struct *tsk, struct css_set *css); /* * CSS ID is ID for cgroup_subsys_state structs under subsys. 
This only works diff -urNp linux-2.6.32.48/include/linux/cgroup_subsys.h linux-2.6.32.48-openvz/include/linux/cgroup_subsys.h --- linux-2.6.32.48/include/linux/cgroup_subsys.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/cgroup_subsys.h 2011-11-21 17:40:45.000000000 -0500 @@ -60,3 +60,9 @@ SUBSYS(net_cls) #endif /* */ + +#ifdef CONFIG_BLK_CGROUP +SUBSYS(blkio) +#endif + +/* */ diff -urNp linux-2.6.32.48/include/linux/compat.h linux-2.6.32.48-openvz/include/linux/compat.h --- linux-2.6.32.48/include/linux/compat.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/compat.h 2011-11-21 17:40:45.000000000 -0500 @@ -258,6 +258,7 @@ asmlinkage long compat_sys_settimeofday( asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern int compat_printk(const char *fmt, ...); +extern int ve_compat_printk(int dst, const char *fmt, ...); extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, diff -urNp linux-2.6.32.48/include/linux/cpt_export.h linux-2.6.32.48-openvz/include/linux/cpt_export.h --- linux-2.6.32.48/include/linux/cpt_export.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/cpt_export.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,36 @@ +/* + * + * include/linux/cpt_export.h + * + * Copyright (C) 2008 Parallels + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __CPT_EXPORTS_H__ +#define __CPT_EXPORTS_H__ + +struct cpt_context; + +struct cpt_ops { + void (*write)(const void *addr, size_t count, struct cpt_context *ctx); + void (*push_object)(loff_t *, struct cpt_context *); + void (*pop_object)(loff_t *, struct cpt_context *); + loff_t (*lookup_object)(int type, void *p, struct cpt_context *ctx); + +}; + +extern struct cpt_ops cpt_ops; + +struct rst_ops { + int (*get_object)(int type, loff_t pos, void *tmp, + int size, struct cpt_context *ctx); + struct file *(*rst_file)(loff_t pos, int fd, struct cpt_context *ctx); +}; + +extern struct rst_ops rst_ops; + +#endif + diff -urNp linux-2.6.32.48/include/linux/cpt_image.h linux-2.6.32.48-openvz/include/linux/cpt_image.h --- linux-2.6.32.48/include/linux/cpt_image.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/cpt_image.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,1842 @@ +/* + * + * include/linux/cpt_image.h + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __CPT_IMAGE_H_ +#define __CPT_IMAGE_H_ 1 + +#define CPT_NULL (~0ULL) +#define CPT_NOINDEX (~0U) + +/* + * Image file layout. + * + * - major header + * - sections[] + * + * Each section is: + * - section header + * - array of objects + * + * All data records are arch independent, 64 bit aligned. 
+ */ + +enum _cpt_object_type +{ + CPT_OBJ_TASK = 0, + CPT_OBJ_MM, + CPT_OBJ_FS, + CPT_OBJ_FILES, + CPT_OBJ_FILE, + CPT_OBJ_SIGHAND_STRUCT, + CPT_OBJ_SIGNAL_STRUCT, + CPT_OBJ_TTY, + CPT_OBJ_SOCKET, + CPT_OBJ_SYSVSEM_UNDO, + CPT_OBJ_NAMESPACE, + CPT_OBJ_SYSV_SHM, + CPT_OBJ_INODE, + CPT_OBJ_UBC, + CPT_OBJ_SLM_SGREG, + CPT_OBJ_SLM_REGOBJ, + CPT_OBJ_SLM_MM, + CPT_OBJ_MAX, + /* The objects above are stored in memory while checkpointing */ + + CPT_OBJ_VMA = 1024, + CPT_OBJ_FILEDESC, + CPT_OBJ_SIGHANDLER, + CPT_OBJ_SIGINFO, + CPT_OBJ_LASTSIGINFO, + CPT_OBJ_SYSV_SEM, + CPT_OBJ_SKB, + CPT_OBJ_FLOCK, + CPT_OBJ_OPENREQ, + CPT_OBJ_VFSMOUNT, + CPT_OBJ_TRAILER, + CPT_OBJ_SYSVSEM_UNDO_REC, + CPT_OBJ_NET_DEVICE, + CPT_OBJ_NET_IFADDR, + CPT_OBJ_NET_ROUTE, + CPT_OBJ_NET_CONNTRACK, + CPT_OBJ_NET_CONNTRACK_EXPECT, + CPT_OBJ_AIO_CONTEXT, + CPT_OBJ_VEINFO, + CPT_OBJ_EPOLL, + CPT_OBJ_EPOLL_FILE, + CPT_OBJ_SKFILTER, + CPT_OBJ_SIGALTSTACK, + CPT_OBJ_SOCK_MCADDR, + CPT_OBJ_BIND_MNT, + CPT_OBJ_SYSVMSG, + CPT_OBJ_SYSVMSG_MSG, + + CPT_OBJ_X86_REGS = 4096, + CPT_OBJ_X86_64_REGS, + CPT_OBJ_PAGES, + CPT_OBJ_COPYPAGES, + CPT_OBJ_REMAPPAGES, + CPT_OBJ_LAZYPAGES, + CPT_OBJ_NAME, + CPT_OBJ_BITS, + CPT_OBJ_REF, + CPT_OBJ_ITERPAGES, + CPT_OBJ_ITERYOUNGPAGES, + CPT_OBJ_VSYSCALL, + CPT_OBJ_IA64_REGS, + CPT_OBJ_INOTIFY, + CPT_OBJ_INOTIFY_WATCH, + CPT_OBJ_INOTIFY_EVENT, + CPT_OBJ_TASK_AUX, + CPT_OBJ_NET_TUNTAP, + CPT_OBJ_NET_HWADDR, + CPT_OBJ_NET_VETH, + CPT_OBJ_NET_STATS, + CPT_OBJ_NET_IPIP_TUNNEL, + + /* 2.6.27-specific */ + CPT_OBJ_NET_TAP_FILTER = 0x01000000, +}; + +#define CPT_ALIGN(n) (((n)+7)&~7) + +struct cpt_major_hdr +{ + __u8 cpt_signature[4]; /* Magic number */ + __u16 cpt_hdrlen; /* Length of this header */ + __u16 cpt_image_version; /* Format of this file */ +#define CPT_VERSION_MINOR(a) ((a) & 0xf) +#define CPT_VERSION_8 0 +#define CPT_VERSION_9 0x100 +#define CPT_VERSION_9_1 0x101 +#define CPT_VERSION_9_2 0x102 +#define CPT_VERSION_16 0x200 +#define CPT_VERSION_18 0x300 +#define 
CPT_VERSION_18_1 0x301 +#define CPT_VERSION_18_2 0x302 +#define CPT_VERSION_18_3 0x303 +#define CPT_VERSION_20 0x400 +#define CPT_VERSION_24 0x500 +#define CPT_VERSION_26 0x600 +#define CPT_VERSION_27 0x700 +#define CPT_VERSION_27_3 0x703 +#define CPT_VERSION_32 0x800 +#define CPT_CURRENT_VERSION CPT_VERSION_32 + __u16 cpt_os_arch; /* Architecture */ +#define CPT_OS_ARCH_I386 0 +#define CPT_OS_ARCH_EMT64 1 +#define CPT_OS_ARCH_IA64 2 + __u16 __cpt_pad1; + __u32 cpt_ve_features; /* VE features */ + __u32 cpt_ve_features2; /* VE features */ + __u16 cpt_pagesize; /* Page size used by OS */ + __u16 cpt_hz; /* HZ used by OS */ + __u64 cpt_start_jiffies64; /* Jiffies */ + __u32 cpt_start_sec; /* Seconds */ + __u32 cpt_start_nsec; /* Nanoseconds */ + __u32 cpt_cpu_caps[4]; /* CPU capabilities */ + __u32 cpt_kernel_config[4]; /* Kernel config */ + __u64 cpt_iptables_mask; /* Used netfilter modules */ +} __attribute__ ((aligned (8))); + +#define CPT_SIGNATURE0 0x79 +#define CPT_SIGNATURE1 0x1c +#define CPT_SIGNATURE2 0x01 +#define CPT_SIGNATURE3 0x63 + +/* CPU capabilities */ +#define CPT_CPU_X86_CMOV 0 +#define CPT_CPU_X86_FXSR 1 +#define CPT_CPU_X86_SSE 2 +#define CPT_CPU_X86_SSE2 3 +#define CPT_CPU_X86_MMX 4 +#define CPT_CPU_X86_3DNOW 5 +#define CPT_CPU_X86_3DNOW2 6 +#define CPT_CPU_X86_SEP 7 +#define CPT_CPU_X86_EMT64 8 +#define CPT_CPU_X86_IA64 9 +#define CPT_CPU_X86_SYSCALL 10 +#define CPT_CPU_X86_SYSCALL32 11 +#define CPT_CPU_X86_SEP32 12 + +/* Unsupported features */ +#define CPT_EXTERNAL_PROCESS 16 +#define CPT_NAMESPACES 17 +#define CPT_SCHEDULER_POLICY 18 +#define CPT_PTRACED_FROM_VE0 19 +#define CPT_UNSUPPORTED_FSTYPE 20 +#define CPT_BIND_MOUNT 21 +#define CPT_UNSUPPORTED_NETDEV 22 +#define CPT_UNSUPPORTED_MISC 23 +#define CPT_SLM_DMPRST 24 + +/* This mask is used to determine whether VE + has some unsupported features or not */ +#define CPT_UNSUPPORTED_MASK 0xffff0000UL + +#define CPT_KERNEL_CONFIG_PAE 0 + +struct cpt_section_hdr +{ + __u64 cpt_next; + __u32 
cpt_section; + __u16 cpt_hdrlen; + __u16 cpt_align; +} __attribute__ ((aligned (8))); + +enum +{ + CPT_SECT_ERROR, /* Error section, content is string */ + CPT_SECT_VEINFO, + CPT_SECT_FILES, /* Files. Content is array of file objects */ + CPT_SECT_TASKS, + CPT_SECT_MM, + CPT_SECT_FILES_STRUCT, + CPT_SECT_FS, + CPT_SECT_SIGHAND_STRUCT, + CPT_SECT_TTY, + CPT_SECT_SOCKET, + CPT_SECT_NAMESPACE, + CPT_SECT_SYSVSEM_UNDO, + CPT_SECT_INODE, /* Inodes with i->i_nlink==0 and + * deleted dentries with inodes not + * referenced inside dumped process. + */ + CPT_SECT_SYSV_SHM, + CPT_SECT_SYSV_SEM, + CPT_SECT_ORPHANS, + CPT_SECT_NET_DEVICE, + CPT_SECT_NET_IFADDR, + CPT_SECT_NET_ROUTE, + CPT_SECT_NET_IPTABLES, + CPT_SECT_NET_CONNTRACK, + CPT_SECT_NET_CONNTRACK_VE0, + CPT_SECT_UTSNAME, + CPT_SECT_TRAILER, + CPT_SECT_UBC, + CPT_SECT_SLM_SGREGS, + CPT_SECT_SLM_REGOBJS, +/* Due to a silly mistake we cannot index sections beyond this value */ +#define CPT_SECT_MAX_INDEX (CPT_SECT_SLM_REGOBJS+1) + CPT_SECT_EPOLL, + CPT_SECT_VSYSCALL, + CPT_SECT_INOTIFY, + CPT_SECT_SYSV_MSG, + CPT_SECT_SNMP_STATS, + CPT_SECT_MAX +}; + +struct cpt_major_tail +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_lazypages; + __u32 cpt_64bit; + __u64 cpt_sections[CPT_SECT_MAX_INDEX]; + __u32 cpt_nsect; + __u8 cpt_signature[4]; /* Magic number */ +} __attribute__ ((aligned (8))); + + +/* Common object header. 
*/ +struct cpt_object_hdr +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; +} __attribute__ ((aligned (8))); + +enum _cpt_content_type { + CPT_CONTENT_VOID, + CPT_CONTENT_ARRAY, + CPT_CONTENT_DATA, + CPT_CONTENT_NAME, + + CPT_CONTENT_STACK, + CPT_CONTENT_X86_FPUSTATE_OLD, + CPT_CONTENT_X86_FPUSTATE, + CPT_CONTENT_MM_CONTEXT, + CPT_CONTENT_SEMARRAY, + CPT_CONTENT_SEMUNDO, + CPT_CONTENT_NLMARRAY, + CPT_CONTENT_MAX +}; + +/* CPT_OBJ_BITS: encode array of bytes */ +struct cpt_obj_bits +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_REF: a reference to another object */ +struct cpt_obj_ref +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_pos; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_VEINFO: various ve specific data */ +struct cpt_veinfo_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + /* ipc ctls */ + __u32 shm_ctl_max; + __u32 shm_ctl_all; + __u32 shm_ctl_mni; + __u32 msg_ctl_max; + __u32 msg_ctl_mni; + __u32 msg_ctl_mnb; + __u32 sem_ctl_arr[4]; + + /* start time */ + __u64 start_timespec_delta; + __u64 start_jiffies_delta; + + /* later extension */ + __u32 last_pid; + __u32 rnd_va_space; + __u64 reserved[8]; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_FILE: one struct file */ +struct cpt_file_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_flags; + __u32 cpt_mode; + __u64 cpt_pos; + __u32 cpt_uid; + __u32 cpt_gid; + + __u32 cpt_i_mode; + __u32 cpt_lflags; +#define CPT_DENTRY_DELETED 1 +#define CPT_DENTRY_ROOT 2 +#define CPT_DENTRY_CLONING 4 +#define CPT_DENTRY_PROC 8 +#define CPT_DENTRY_EPOLL 0x10 +#define CPT_DENTRY_REPLACED 0x20 +#define CPT_DENTRY_INOTIFY 0x40 +#define CPT_DENTRY_FUTEX 0x80 +#define CPT_DENTRY_TUNTAP 0x100 +#define CPT_DENTRY_PROCPID_DEAD 0x200 
+#define CPT_DENTRY_HARDLINKED 0x400 +#define CPT_DENTRY_SIGNALFD 0x800 + __u64 cpt_inode; + __u64 cpt_priv; + + __u32 cpt_fown_fd; + __u32 cpt_fown_pid; +#define CPT_FOWN_STRAY_PID 0 + __u32 cpt_fown_uid; + __u32 cpt_fown_euid; + __u32 cpt_fown_signo; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by file name, encoded as CPT_OBJ_NAME */ + +struct cpt_epoll_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; +} __attribute__ ((aligned (8))); +/* Followed by array of struct cpt_epoll_file */ + +struct cpt_epoll_file_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_fd; + __u32 cpt_events; + __u64 cpt_data; + __u32 cpt_revents; + __u32 cpt_ready; +} __attribute__ ((aligned (8))); + +struct cpt_inotify_wd_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_wd; + __u32 cpt_mask; +} __attribute__ ((aligned (8))); +/* Followed by cpt_file_image of inode to watch */ + +struct cpt_inotify_ev_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_wd; + __u32 cpt_mask; + __u32 cpt_cookie; + __u32 cpt_namelen; +} __attribute__ ((aligned (8))); +/* Followed by name */ + +struct cpt_inotify_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_user; + __u32 cpt_max_events; + __u32 cpt_last_wd; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by array of struct cpt_inotify_wd_image and cpt_inotify_ev_image */ + + +/* CPT_OBJ_FILEDESC: one file descriptor */ +struct cpt_fd_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_fd; + __u32 cpt_flags; +#define CPT_FD_FLAG_CLOSEEXEC 1 + __u64 cpt_file; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_FILES: one files_struct */ +struct cpt_files_struct_image { + __u64 
cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u32 cpt_max_fds; + __u32 cpt_next_fd; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by array of cpt_fd_image */ + +/* CPT_OBJ_FS: one fs_struct */ +struct cpt_fs_struct_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_umask; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */ + +/* CPT_OBJ_INODE: one struct inode */ +struct cpt_inode_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_dev; + __u64 cpt_ino; + __u32 cpt_mode; + __u32 cpt_nlink; + __u32 cpt_uid; + __u32 cpt_gid; + __u64 cpt_rdev; + __u64 cpt_size; + __u64 cpt_blksize; + __u64 cpt_atime; + __u64 cpt_mtime; + __u64 cpt_ctime; + __u64 cpt_blocks; + __u32 cpt_sb; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_VFSMOUNT: one vfsmount */ +struct cpt_vfsmount_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_mntflags; +#define CPT_MNT_BIND 0x80000000 +#define CPT_MNT_EXT 0x40000000 + __u32 cpt_flags; +} __attribute__ ((aligned (8))); + + +struct cpt_flock_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_pid; + __u64 cpt_start; + __u64 cpt_end; + __u32 cpt_flags; + __u32 cpt_type; +} __attribute__ ((aligned (8))); + + +struct cpt_tty_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_flags; + __u32 cpt_link; + __u32 cpt_index; + __u32 cpt_drv_type; + __u32 cpt_drv_subtype; + __u32 cpt_drv_flags; + __u8 cpt_packet; + __u8 cpt_stopped; + __u8 cpt_hw_stopped; + __u8 cpt_flow_stopped; + + __u32 cpt_canon_data; + __u32 cpt_canon_head; + __u32 cpt_canon_column; + __u32 cpt_column; + __u8 cpt_ctrl_status; + __u8 cpt_erasing; 
+ __u8 cpt_lnext; + __u8 cpt_icanon; + __u8 cpt_raw; + __u8 cpt_real_raw; + __u8 cpt_closing; + __u8 __cpt_pad1; + __u16 cpt_minimum_to_wake; + __u16 __cpt_pad2; + __u32 cpt_pgrp; + __u32 cpt_session; + __u32 cpt_c_line; + __u8 cpt_name[64]; + __u16 cpt_ws_row; + __u16 cpt_ws_col; + __u16 cpt_ws_prow; + __u16 cpt_ws_pcol; + __u8 cpt_c_cc[32]; + __u32 cpt_c_iflag; + __u32 cpt_c_oflag; + __u32 cpt_c_cflag; + __u32 cpt_c_lflag; + __u32 cpt_read_flags[4096/32]; +} __attribute__ ((aligned (8))); + +struct cpt_sock_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_parent; + __u32 cpt_index; + + __u64 cpt_ssflags; + __u16 cpt_type; + __u16 cpt_family; + __u8 cpt_sstate; + __u8 cpt_passcred; + __u8 cpt_state; + __u8 cpt_reuse; + + __u8 cpt_zapped; + __u8 cpt_shutdown; + __u8 cpt_userlocks; + __u8 cpt_no_check; + __u8 cpt_debug; + __u8 cpt_rcvtstamp; + __u8 cpt_localroute; + __u8 cpt_protocol; + + __u32 cpt_err; + __u32 cpt_err_soft; + + __u16 cpt_max_ack_backlog; + __u16 __cpt_pad1; + __u32 cpt_priority; + + __u32 cpt_rcvlowat; + __u32 cpt_bound_dev_if; + + __u64 cpt_rcvtimeo; + __u64 cpt_sndtimeo; + __u32 cpt_rcvbuf; + __u32 cpt_sndbuf; + __u64 cpt_flags; + __u64 cpt_lingertime; + __u32 cpt_peer_pid; + __u32 cpt_peer_uid; + + __u32 cpt_peer_gid; + __u32 cpt_laddrlen; + __u32 cpt_laddr[128/4]; + __u32 cpt_raddrlen; + __u32 cpt_raddr[128/4]; + /* AF_UNIX */ + __u32 cpt_peer; + + __u8 cpt_socketpair; + __u8 cpt_deleted; + __u16 __cpt_pad4; + __u32 __cpt_pad5; +/* + struct sk_filter *sk_filter; + */ + + __u64 cpt_stamp; + __u32 cpt_daddr; + __u16 cpt_dport; + __u16 cpt_sport; + + __u32 cpt_saddr; + __u32 cpt_rcv_saddr; + + __u32 cpt_uc_ttl; + __u32 cpt_tos; + + __u32 cpt_cmsg_flags; + __u32 cpt_mc_index; + + __u32 cpt_mc_addr; +/* + struct ip_options *opt; + */ + __u8 cpt_hdrincl; + __u8 cpt_mc_ttl; + __u8 cpt_mc_loop; + __u8 cpt_pmtudisc; + + __u8 cpt_recverr; + __u8 cpt_freebind; + __u16 cpt_idcounter; + 
__u32 cpt_cork_flags; + + __u32 cpt_cork_fragsize; + __u32 cpt_cork_length; + __u32 cpt_cork_addr; + __u32 cpt_cork_saddr; + __u32 cpt_cork_daddr; + __u32 cpt_cork_oif; + + __u32 cpt_udp_pending; + __u32 cpt_udp_corkflag; + __u16 cpt_udp_encap; + __u16 cpt_udp_len; + __u32 __cpt_pad7; + + __u64 cpt_saddr6[2]; + __u64 cpt_rcv_saddr6[2]; + __u64 cpt_daddr6[2]; + __u32 cpt_flow_label6; + __u32 cpt_frag_size6; + __u32 cpt_hop_limit6; + __u32 cpt_mcast_hops6; + + __u32 cpt_mcast_oif6; + __u8 cpt_rxopt6; + __u8 cpt_mc_loop6; + __u8 cpt_recverr6; + __u8 cpt_sndflow6; + + __u8 cpt_pmtudisc6; + __u8 cpt_ipv6only6; + __u8 cpt_mapped; + __u8 __cpt_pad8; + __u32 cpt_pred_flags; + + __u32 cpt_rcv_nxt; + __u32 cpt_snd_nxt; + + __u32 cpt_snd_una; + __u32 cpt_snd_sml; + + __u32 cpt_rcv_tstamp; + __u32 cpt_lsndtime; + + __u8 cpt_tcp_header_len; + __u8 cpt_ack_pending; + __u8 cpt_quick; + __u8 cpt_pingpong; + __u8 cpt_blocked; + __u8 __cpt_pad9; + __u16 __cpt_pad10; + + __u32 cpt_ato; + __u32 cpt_ack_timeout; + + __u32 cpt_lrcvtime; + __u16 cpt_last_seg_size; + __u16 cpt_rcv_mss; + + __u32 cpt_snd_wl1; + __u32 cpt_snd_wnd; + + __u32 cpt_max_window; + __u32 cpt_pmtu_cookie; + + __u32 cpt_mss_cache; + __u16 cpt_mss_cache_std; + __u16 cpt_mss_clamp; + + __u16 cpt_ext_header_len; + __u16 cpt_ext2_header_len; + __u8 cpt_ca_state; + __u8 cpt_retransmits; + __u8 cpt_reordering; + __u8 cpt_frto_counter; + + __u32 cpt_frto_highmark; + __u8 cpt_adv_cong; + __u8 cpt_defer_accept; + __u8 cpt_backoff; + __u8 __cpt_pad11; + + __u32 cpt_srtt; + __u32 cpt_mdev; + + __u32 cpt_mdev_max; + __u32 cpt_rttvar; + + __u32 cpt_rtt_seq; + __u32 cpt_rto; + + __u32 cpt_packets_out; + __u32 cpt_left_out; + + __u32 cpt_retrans_out; + __u32 cpt_snd_ssthresh; + + __u32 cpt_snd_cwnd; + __u16 cpt_snd_cwnd_cnt; + __u16 cpt_snd_cwnd_clamp; + + __u32 cpt_snd_cwnd_used; + __u32 cpt_snd_cwnd_stamp; + + __u32 cpt_timeout; + __u32 cpt_ka_timeout; + + __u32 cpt_rcv_wnd; + __u32 cpt_rcv_wup; + + __u32 cpt_write_seq; + __u32 
cpt_pushed_seq; + + __u32 cpt_copied_seq; + __u8 cpt_tstamp_ok; + __u8 cpt_wscale_ok; + __u8 cpt_sack_ok; + __u8 cpt_saw_tstamp; + + __u8 cpt_snd_wscale; + __u8 cpt_rcv_wscale; + __u8 cpt_nonagle; + __u8 cpt_keepalive_probes; + __u32 cpt_rcv_tsval; + + __u32 cpt_rcv_tsecr; + __u32 cpt_ts_recent; + + __u64 cpt_ts_recent_stamp; + __u16 cpt_user_mss; + __u8 cpt_dsack; + __u8 unused; /* was cpt_eff_sacks */ + __u32 cpt_sack_array[2*5]; + __u32 cpt_window_clamp; + + __u32 cpt_rcv_ssthresh; + __u8 cpt_probes_out; + __u8 cpt_num_sacks; + __u16 cpt_advmss; + + __u8 cpt_syn_retries; + __u8 cpt_ecn_flags; + __u16 cpt_prior_ssthresh; + __u32 cpt_lost_out; + + __u32 cpt_sacked_out; + __u32 cpt_fackets_out; + + __u32 cpt_high_seq; + __u32 cpt_retrans_stamp; + + __u32 cpt_undo_marker; + __u32 cpt_undo_retrans; + + __u32 cpt_urg_seq; + __u16 cpt_urg_data; + __u8 cpt_pending; + __u8 unused2; /* was cpt_urg_mode */ + + __u32 cpt_snd_up; + __u32 cpt_keepalive_time; + + __u32 cpt_keepalive_intvl; + __u32 cpt_linger2; + + __u32 cpt_rcvrtt_rtt; + __u32 cpt_rcvrtt_seq; + + __u32 cpt_rcvrtt_time; + __u32 __cpt_pad12; +} __attribute__ ((aligned (8))); + +struct cpt_sockmc_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u16 cpt_family; + __u16 cpt_mode; + __u32 cpt_ifindex; + __u32 cpt_mcaddr[4]; +} __attribute__ ((aligned (8))); +/* Followed by array of source addresses, each zero padded to 16 bytes */ + +struct cpt_openreq_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_rcv_isn; + __u32 cpt_snt_isn; + + __u16 cpt_rmt_port; + __u16 cpt_mss; + __u8 cpt_family; + __u8 cpt_retrans; + __u8 cpt_snd_wscale; + __u8 cpt_rcv_wscale; + + __u8 cpt_tstamp_ok; + __u8 cpt_sack_ok; + __u8 cpt_wscale_ok; + __u8 cpt_ecn_ok; + __u8 cpt_acked; + __u8 __cpt_pad1; + __u16 __cpt_pad2; + + __u32 cpt_window_clamp; + __u32 cpt_rcv_wnd; + __u32 cpt_ts_recent; + __u32 cpt_iif; + __u64 cpt_expires; + + __u64 
cpt_loc_addr[2]; + __u64 cpt_rmt_addr[2]; +/* + struct ip_options *opt; + */ + +} __attribute__ ((aligned (8))); + +struct cpt_skb_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_queue; +#define CPT_SKB_NQ 0 +#define CPT_SKB_RQ 1 +#define CPT_SKB_WQ 2 +#define CPT_SKB_OFOQ 3 + + __u64 cpt_stamp; + __u32 cpt_len; + __u32 cpt_hspace; + __u32 cpt_tspace; + __u32 cpt_h; + __u32 cpt_nh; + __u32 cpt_mac; + + __u64 cpt_cb[5]; + __u32 cpt_mac_len; + __u32 cpt_csum; + __u8 cpt_local_df; + __u8 cpt_pkt_type; + __u8 cpt_ip_summed; + __u8 __cpt_pad1; + __u32 cpt_priority; + __u16 cpt_protocol; + __u16 cpt_security; + __u16 cpt_gso_segs; + __u16 cpt_gso_size; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvshm_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + + __u32 cpt_id; + __u32 cpt_mlockuser; + __u64 cpt_segsz; + __u64 cpt_atime; + __u64 cpt_ctime; + __u64 cpt_dtime; + __u64 cpt_creator; + __u64 cpt_last; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvsem_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + __u32 cpt_id; + __u32 __cpt_pad1; + + __u64 cpt_otime; + __u64 cpt_ctime; +} __attribute__ ((aligned (8))); +/* Content is array of pairs semval/sempid */ + +struct cpt_sysvsem_undo_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_id; + __u32 cpt_nsem; +} __attribute__ ((aligned (8))); + +struct cpt_sysvmsg_msg_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_type; + __u64 cpt_size; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvmsg_image +{ + __u64 
cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + __u32 cpt_id; + __u32 __cpt_pad1; + + __u64 cpt_stime; + __u64 cpt_rtime; + __u64 cpt_ctime; + __u64 cpt_last_sender; + __u64 cpt_last_receiver; + __u64 cpt_qbytes; +} __attribute__ ((aligned (8))); +/* Content is array of sysv msg */ + + +struct cpt_mm_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start_code; + __u64 cpt_end_code; + __u64 cpt_start_data; + __u64 cpt_end_data; + __u64 cpt_start_brk; + __u64 cpt_brk; + __u64 cpt_start_stack; + __u64 cpt_start_arg; + __u64 cpt_end_arg; + __u64 cpt_start_env; + __u64 cpt_end_env; + __u64 cpt_def_flags; + __u64 cpt_mmub; + __u8 cpt_dumpable; + __u8 cpt_vps_dumpable; + __u8 cpt_used_hugetlb; + __u8 __cpt_pad; + __u32 cpt_vdso; +} __attribute__ ((aligned (8))); + +struct cpt_page_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; +} __attribute__ ((aligned (8))); + +struct cpt_remappage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_pgoff; +} __attribute__ ((aligned (8))); + +struct cpt_copypage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_source; +} __attribute__ ((aligned (8))); + +struct cpt_lazypage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_index; +} __attribute__ ((aligned (8))); + +struct cpt_iterpage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; +} __attribute__ ((aligned (8))); +/* Followed by array of PFNs */ + +struct cpt_vma_image +{ + 
__u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_type; +#define CPT_VMA_TYPE_0 0 +#define CPT_VMA_TYPE_SHM 1 +#define CPT_VMA_VDSO 2 + __u32 cpt_anonvma; + __u64 cpt_anonvmaid; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_flags; + __u64 cpt_pgprot; + __u64 cpt_pgoff; +} __attribute__ ((aligned (8))); + +struct cpt_aio_ctx_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_max_reqs; + __u32 cpt_ring_pages; + __u32 cpt_tail; + __u32 cpt_nr; + __u64 cpt_mmap_base; + /* Data (io_event's) and struct aio_ring are stored in user space VM */ +} __attribute__ ((aligned (8))); + + +/* Format of MM section. + * + * It is array of MM objects (mm_struct). Each MM object is + * header, encoding mm_struct, followed by array of VMA objects. + * Each VMA consists of VMA header, encoding vm_area_struct, and + * if the VMA contains copied pages, the header is followed by + * array of tuples start-end each followed by data. + * + * ATTN: no block/page alignment. Only 64bit alignment. This might be not good? 
+ */ +struct cpt_restart_block { + __u64 fn; +#define CPT_RBL_0 0 +#define CPT_RBL_NANOSLEEP 1 +#define CPT_RBL_COMPAT_NANOSLEEP 2 +#define CPT_RBL_POLL 3 +#define CPT_RBL_FUTEX_WAIT 4 + __u64 arg0; + __u64 arg1; + __u64 arg2; + __u64 arg3; +} __attribute__ ((aligned (8))); + +struct cpt_siginfo_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_qflags; + __u32 cpt_signo; + __u32 cpt_errno; + __u32 cpt_code; + + __u64 cpt_sigval; + __u32 cpt_pid; + __u32 cpt_uid; + __u64 cpt_utime; + __u64 cpt_stime; + + __u64 cpt_user; +} __attribute__ ((aligned (8))); + +/* Portable representations for segment registers */ + +#define CPT_SEG_ZERO 0 +#define CPT_SEG_TLS1 1 +#define CPT_SEG_TLS2 2 +#define CPT_SEG_TLS3 3 +#define CPT_SEG_USER32_DS 4 +#define CPT_SEG_USER32_CS 5 +#define CPT_SEG_USER64_DS 6 +#define CPT_SEG_USER64_CS 7 +#define CPT_SEG_LDT 256 + +struct cpt_x86_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_debugreg[8]; + __u32 cpt_fs; + __u32 cpt_gs; + + __u32 cpt_ebx; + __u32 cpt_ecx; + __u32 cpt_edx; + __u32 cpt_esi; + __u32 cpt_edi; + __u32 cpt_ebp; + __u32 cpt_eax; + __u32 cpt_xds; + __u32 cpt_xes; + __u32 cpt_orig_eax; + __u32 cpt_eip; + __u32 cpt_xcs; + __u32 cpt_eflags; + __u32 cpt_esp; + __u32 cpt_xss; + __u32 cpt_ugs; +}; + +struct cpt_x86_64_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_debugreg[8]; + + __u64 cpt_fsbase; + __u64 cpt_gsbase; + __u32 cpt_fsindex; + __u32 cpt_gsindex; + __u32 cpt_ds; + __u32 cpt_es; + + __u64 cpt_r15; + __u64 cpt_r14; + __u64 cpt_r13; + __u64 cpt_r12; + __u64 cpt_rbp; + __u64 cpt_rbx; + __u64 cpt_r11; + __u64 cpt_r10; + __u64 cpt_r9; + __u64 cpt_r8; + __u64 cpt_rax; + __u64 cpt_rcx; + __u64 cpt_rdx; + __u64 cpt_rsi; + __u64 cpt_rdi; + __u64 cpt_orig_rax; + __u64 cpt_rip; + __u64 cpt_cs; + __u64 cpt_eflags; + __u64 cpt_rsp; + __u64 cpt_ss; +}; + +struct cpt_ia64_regs +{
+ __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 gr[128]; + __u64 fr[256]; + __u64 br[8]; + __u64 nat[2]; + + __u64 ar_bspstore; + __u64 num_regs; + __u64 loadrs; + __u64 ar_bsp; + __u64 ar_unat; + __u64 ar_pfs; + __u64 ar_ccv; + __u64 ar_fpsr; + __u64 ar_csd; + __u64 ar_ssd; + __u64 ar_ec; + __u64 ar_lc; + __u64 ar_rsc; + __u64 ar_rnat; + + __u64 cr_iip; + __u64 cr_ipsr; + + __u64 cfm; + __u64 pr; + + __u64 ibr[8]; + __u64 dbr[8]; +}; + + +struct cpt_task_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_state; + __u64 cpt_flags; +#define CPT_TASK_FLAGS_MASK (PF_EXITING | PF_FORKNOEXEC | \ + PF_SUPERPRIV | PF_DUMPCORE | PF_SIGNALED) + __u64 cpt_ptrace; + __u32 cpt_prio; + __u32 cpt_static_prio; + __u32 cpt_policy; + __u32 cpt_rt_priority; + + /* struct thread_info */ + __u64 cpt_exec_domain; + __u64 cpt_thrflags; + __u64 cpt_thrstatus; + __u64 cpt_addr_limit; + + __u64 cpt_personality; + + __u64 cpt_mm; + __u64 cpt_files; + __u64 cpt_fs; + __u64 cpt_signal; + __u64 cpt_sighand; + __u64 cpt_sigblocked; + __u64 cpt_sigrblocked; + __u64 cpt_sigpending; + __u64 cpt_namespace; + __u64 cpt_sysvsem_undo; + __u32 cpt_pid; + __u32 cpt_tgid; + __u32 cpt_ppid; + __u32 cpt_rppid; + __u32 cpt_pgrp; + __u32 cpt_session; + __u32 cpt_old_pgrp; + __u32 __cpt_pad; + __u32 cpt_leader; + __u8 cpt_pn_state; + __u8 cpt_stopped_state; + __u8 cpt_sigsuspend_state; + __u8 cpt_64bit; + __u64 cpt_set_tid; + __u64 cpt_clear_tid; + __u32 cpt_exit_code; + __u32 cpt_exit_signal; + __u32 cpt_pdeath_signal; + __u32 cpt_user; + __u32 cpt_uid; + __u32 cpt_euid; + __u32 cpt_suid; + __u32 cpt_fsuid; + __u32 cpt_gid; + __u32 cpt_egid; + __u32 cpt_sgid; + __u32 cpt_fsgid; + __u32 cpt_ngids; + __u32 cpt_gids[32]; + __u8 cpt_prctl_uac; + __u8 cpt_prctl_fpemu; + __u16 __cpt_pad1; + __u64 cpt_ecap; + __u64 cpt_icap; + __u64 cpt_pcap; + __u8 cpt_comm[16]; + __u64 cpt_tls[3]; + struct cpt_restart_block cpt_restart; + 
__u64 cpt_it_real_value; /* V8: jiffies, V9..: nsec */ + __u64 cpt_it_real_incr; /* V8: jiffies, V9..: nsec */ + __u64 cpt_it_prof_value; + __u64 cpt_it_prof_incr; + __u64 cpt_it_virt_value; + __u64 cpt_it_virt_incr; + + __u16 cpt_used_math; + __u8 cpt_keepcap; + __u8 cpt_did_exec; + __u32 cpt_ptrace_message; + + __u64 cpt_utime; + __u64 cpt_stime; + __u64 cpt_starttime; /* V8: jiffies, V9...: timespec */ + __u64 cpt_nvcsw; + __u64 cpt_nivcsw; + __u64 cpt_min_flt; + __u64 cpt_maj_flt; + + __u64 cpt_sigsuspend_blocked; + __u64 cpt_cutime, cpt_cstime; + __u64 cpt_cnvcsw, cpt_cnivcsw; + __u64 cpt_cmin_flt, cpt_cmaj_flt; + +#define CPT_RLIM_NLIMITS 16 + __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; + __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; + + __u64 cpt_task_ub; + __u64 cpt_exec_ub; + __u64 cpt_mm_ub; + __u64 cpt_fork_sub; +} __attribute__ ((aligned (8))); + +struct cpt_sigaltstack_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_stack; + __u32 cpt_stacksize; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +struct cpt_task_aux_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_robust_list; + __u64 __cpt_future[16]; +} __attribute__ ((aligned (8))); + + +struct cpt_signal_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_leader; + __u8 cpt_pgrp_type; + __u8 cpt_old_pgrp_type; + __u8 cpt_session_type; +#define CPT_PGRP_NORMAL 0 +#define CPT_PGRP_ORPHAN 1 +#define CPT_PGRP_STRAY 2 + __u8 __cpt_pad1; + __u64 cpt_pgrp; + __u64 cpt_old_pgrp; + __u64 cpt_session; + __u64 cpt_sigpending; + __u64 cpt_ctty; + + __u32 cpt_curr_target; + __u32 cpt_group_exit; + __u32 cpt_group_exit_code; + __u32 cpt_group_exit_task; + __u32 cpt_notify_count; + __u32 cpt_group_stop_count; + __u32 cpt_stop_state; + __u32 __cpt_pad2; + + __u64 cpt_utime, cpt_stime, cpt_cutime, cpt_cstime; + __u64 cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw; + __u64 
cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt; + + __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; + __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; +} __attribute__ ((aligned (8))); +/* Followed by list of posix timers. */ + +struct cpt_sighand_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + +} __attribute__ ((aligned (8))); +/* Followed by list of sighandles. */ + +struct cpt_sighandler_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_signo; + __u32 __cpt_pad1; + __u64 cpt_handler; + __u64 cpt_restorer; + __u64 cpt_flags; + __u64 cpt_mask; +} __attribute__ ((aligned (8))); + +struct cpt_netdev_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u32 cpt_flags; + __u8 cpt_name[16]; +} __attribute__ ((aligned (8))); + +struct cpt_tuntap_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 unused; /* was cpt_attached */ + __u64 cpt_flags; + __u64 cpt_bindfile; + __u64 cpt_if_flags; + __u8 cpt_dev_addr[6]; + __u16 cpt_pad; + __u32 cpt_chr_filter[2]; + __u32 cpt_net_filter[2]; +} __attribute__ ((aligned (8))); + +struct cpt_tap_filter_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_count; + __u32 cpt_mask[2]; + __u8 cpt_addr[8][6]; +} __attribute__ ((aligned (8))); + +struct cpt_veth_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_allow_mac_change; + __u32 __cpt_pad; +} __attribute__ ((aligned (8))); + +struct cpt_tunnel_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_tnl_flags; +#define CPT_TUNNEL_FBDEV 0x1 +#define CPT_TUNNEL_SIT 0x2 +#define CPT_TUNNEL_GRE 0x4 + __u16 cpt_i_flags; + __u16 cpt_o_flags; + __u32 cpt_i_key; + __u32 cpt_o_key; + __u32 cpt_iphdr[5]; + __u32 cpt_i_seqno; + __u32 cpt_o_seqno; +} 
__attribute__ ((aligned (8))); + +struct cpt_hwaddr_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u8 cpt_dev_addr[32]; +} __attribute__ ((aligned (8))); + +struct cpt_netstats_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_rx_packets; + __u64 cpt_tx_packets; + __u64 cpt_rx_bytes; + __u64 cpt_tx_bytes; + __u64 cpt_rx_errors; + __u64 cpt_tx_errors; + __u64 cpt_rx_dropped; + __u64 cpt_tx_dropped; + __u64 cpt_multicast; + __u64 cpt_collisions; + __u64 cpt_rx_length_errors; + __u64 cpt_rx_over_errors; + __u64 cpt_rx_crc_errors; + __u64 cpt_rx_frame_errors; + __u64 cpt_rx_fifo_errors; + __u64 cpt_rx_missed_errors; + __u64 cpt_tx_aborted_errors; + __u64 cpt_tx_carrier_errors; + __u64 cpt_tx_fifo_errors; + __u64 cpt_tx_heartbeat_errors; + __u64 cpt_tx_window_errors; + __u64 cpt_rx_compressed; + __u64 cpt_tx_compressed; + __u64 pad[4]; +} __attribute__ ((aligned (8))); + +struct cpt_ifaddr_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u8 cpt_family; + __u8 cpt_masklen; + __u8 cpt_flags; + __u8 cpt_scope; + __u32 cpt_address[4]; + __u32 cpt_peer[4]; + __u32 cpt_broadcast[4]; + __u8 cpt_label[16]; + __u32 cpt_valid_lft; + __u32 cpt_prefered_lft; +} __attribute__ ((aligned (8))); + +struct cpt_ipct_tuple +{ + __u32 cpt_src; + __u16 cpt_srcport; + __u16 __cpt_pad1; + + __u32 cpt_dst; + __u16 cpt_dstport; + __u8 cpt_protonum; + __u8 cpt_dir; /* TEMPORARY HACK TO VALIDATE CODE */ +} __attribute__ ((aligned (8))); + +struct cpt_nat_manip +{ + __u8 cpt_direction; + __u8 cpt_hooknum; + __u8 cpt_maniptype; + __u8 __cpt_pad1; + + __u32 cpt_manip_addr; + __u16 cpt_manip_port; + __u16 __cpt_pad2; + __u32 __cpt_pad3; +} __attribute__ ((aligned (8))); + +struct cpt_nat_seq +{ + __u32 cpt_correction_pos; + __u32 cpt_offset_before; + __u32 cpt_offset_after; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +struct 
cpt_ip_connexpect_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_timeout; + __u32 cpt_sibling_conntrack; /* Index of child conntrack */ + __u32 cpt_seq; /* id in 2.6.15 */ + + struct cpt_ipct_tuple cpt_ct_tuple; /* NU 2.6.15 */ + struct cpt_ipct_tuple cpt_tuple; + struct cpt_ipct_tuple cpt_mask; + + /* union ip_conntrack_expect_help. Used by ftp, irc, amanda */ + __u32 cpt_help[3]; /* NU 2.6.15 */ + __u16 cpt_manip_proto; + __u8 cpt_dir; + __u8 cpt_flags; +} __attribute__ ((aligned (8))); + +struct cpt_ip_conntrack_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + struct cpt_ipct_tuple cpt_tuple[2]; + __u64 cpt_status; + __u64 cpt_timeout; + __u32 cpt_index; + __u8 cpt_ct_helper; + __u8 cpt_nat_helper; + __u16 cpt_pad1; + + /* union ip_conntrack_proto. Used by tcp and icmp. */ + __u32 cpt_proto_data[12]; + + /* union ip_conntrack_help. Used by ftp and pptp helper. + * We do not support pptp... + */ + __u32 cpt_help_data[6]; + + /* nat info */ + __u32 cpt_initialized; /* NU 2.6.15 */ + __u32 cpt_num_manips; /* NU 2.6.15 */ + struct cpt_nat_manip cpt_nat_manips[6]; /* NU 2.6.15 */ + + struct cpt_nat_seq cpt_nat_seq[2]; + + __u32 cpt_masq_index; + __u32 cpt_id; + __u32 cpt_mark; +} __attribute__ ((aligned (8))); + +/* cpt_ip_conntrack_image struct from 2.6.9 kernel */ +struct cpt_ip_conntrack_image_compat +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + struct cpt_ipct_tuple cpt_tuple[2]; + __u64 cpt_status; + __u64 cpt_timeout; + __u32 cpt_index; + __u8 cpt_ct_helper; + __u8 cpt_nat_helper; + __u16 __cpt_pad1; + + /* union ip_conntrack_proto. Used by tcp and icmp. */ + __u32 cpt_proto_data[12]; + + /* union ip_conntrack_help. Used only by ftp helper. 
*/ + __u32 cpt_help_data[4]; + + /* nat info */ + __u32 cpt_initialized; + __u32 cpt_num_manips; + struct cpt_nat_manip cpt_nat_manips[6]; + + struct cpt_nat_seq cpt_nat_seq[2]; + + __u32 cpt_masq_index; + __u32 __cpt_pad2; +} __attribute__ ((aligned (8))); + +struct cpt_ubparm +{ + __u64 barrier; + __u64 limit; + __u64 held; + __u64 maxheld; + __u64 minheld; + __u64 failcnt; +} __attribute__ ((aligned (8))); + +struct cpt_beancounter_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_parent; + __u32 cpt_id; + __u32 cpt_ub_resources; + struct cpt_ubparm cpt_parms[32 * 2]; +} __attribute__ ((aligned (8))); + +struct cpt_slm_sgreg_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; + __u32 cpt_id; + __u16 cpt_resource; + __u8 cpt_regname[32]; + __u8 __cpt_pad2[2]; +} __attribute__ ((aligned (8))); + +struct cpt_slm_obj_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +#ifdef __KERNEL__ + +static inline void __user * cpt_ptr_import(__u64 ptr) +{ + return (void*)(unsigned long)ptr; +} + +static inline __u64 cpt_ptr_export(void __user *ptr) +{ + return (__u64)(unsigned long)ptr; +} + +static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr) +{ + memcpy(sig, &ptr, sizeof(*sig)); +} + +static inline __u64 cpt_sigset_export(sigset_t *sig) +{ + return *(__u64*)sig; +} + +static inline __u64 cpt_timespec_export(struct timespec *tv) +{ + return (((u64)tv->tv_sec) << 32) + tv->tv_nsec; +} + +static inline void cpt_timespec_import(struct timespec *tv, __u64 val) +{ + tv->tv_sec = val>>32; + tv->tv_nsec = (val&0xFFFFFFFF); +} + +static inline __u64 cpt_timeval_export(struct timeval *tv) +{ + return (((u64)tv->tv_sec) << 32) + tv->tv_usec; +} + +static inline void cpt_timeval_import(struct timeval *tv, __u64 val) +{ + tv->tv_sec = val>>32; 
+ tv->tv_usec = (val&0xFFFFFFFF); +} + +#endif + +#endif /* __CPT_IMAGE_H_ */ diff -urNp linux-2.6.32.48/include/linux/cpt_ioctl.h linux-2.6.32.48-openvz/include/linux/cpt_ioctl.h --- linux-2.6.32.48/include/linux/cpt_ioctl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/cpt_ioctl.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,45 @@ +/* + * + * include/linux/cpt_ioctl.h + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _CPT_IOCTL_H_ +#define _CPT_IOCTL_H_ 1 + +#include +#include + +#define CPTCTLTYPE '-' +#define CPT_SET_DUMPFD _IOW(CPTCTLTYPE, 1, int) +#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int) +#define CPT_SET_LOCKFD _IOW(CPTCTLTYPE, 3, int) +#define CPT_SET_VEID _IOW(CPTCTLTYPE, 4, int) +#define CPT_SUSPEND _IO(CPTCTLTYPE, 5) +#define CPT_DUMP _IO(CPTCTLTYPE, 6) +#define CPT_UNDUMP _IO(CPTCTLTYPE, 7) +#define CPT_RESUME _IO(CPTCTLTYPE, 8) +#define CPT_KILL _IO(CPTCTLTYPE, 9) +#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10) +#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int) +#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12) +#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int) +#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int) +#define CPT_PAGEIND _IO(CPTCTLTYPE, 15) +#define CPT_VMPREP _IOW(CPTCTLTYPE, 16, int) +#define CPT_SET_LAZY _IOW(CPTCTLTYPE, 17, int) +#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int) +#define CPT_TEST_CAPS _IOW(CPTCTLTYPE, 19, unsigned int) +#define CPT_TEST_VECAPS _IOW(CPTCTLTYPE, 20, unsigned int) +#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int) + +#define CPT_ITER _IOW(CPTCTLTYPE, 23, int) +#define CPT_LINKDIR_ADD _IOW(CPTCTLTYPE, 24, int) +#define CPT_HARDLNK_ON _IOW(CPTCTLTYPE, 25, int) + +#endif diff -urNp linux-2.6.32.48/include/linux/dcache.h linux-2.6.32.48-openvz/include/linux/dcache.h --- linux-2.6.32.48/include/linux/dcache.h 2011-11-08 19:02:43.000000000 -0500 +++ 
linux-2.6.32.48-openvz/include/linux/dcache.h 2011-11-21 17:40:45.000000000 -0500 @@ -8,6 +8,8 @@ #include #include +#include + struct nameidata; struct path; struct vfsmount; @@ -116,6 +118,9 @@ struct dentry { struct super_block *d_sb; /* The root of the dentry tree */ void *d_fsdata; /* fs-specific data */ +#ifdef CONFIG_BEANCOUNTERS + struct dentry_beancounter dentry_bc; +#endif unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ }; @@ -186,6 +191,10 @@ d_iput: no no no yes #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ +#define DCACHE_VIRTUAL 0x0100 /* ve accessible */ + +extern void mark_tree_virtual(struct path *path); +extern struct kmem_cache *dentry_cache; extern spinlock_t dcache_lock; extern seqlock_t rename_lock; @@ -314,6 +323,7 @@ extern char *dynamic_dname(struct dentry extern char *__d_path(const struct path *path, struct path *root, char *, int); extern char *d_path(const struct path *, char *, int); extern char *dentry_path(struct dentry *, char *, int); +extern int d_root_check(struct path *path); /* Allocation counts.. 
*/ @@ -333,6 +343,12 @@ extern char *dentry_path(struct dentry * static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { +#ifdef CONFIG_BEANCOUNTERS + preempt_disable(); + if (ub_dentry_on && ub_dget_testone(dentry)) + BUG(); + preempt_enable_no_resched(); +#endif BUG_ON(!atomic_read(&dentry->d_count)); atomic_inc(&dentry->d_count); } @@ -380,4 +396,5 @@ extern struct dentry *lookup_create(stru extern int sysctl_vfs_cache_pressure; +extern int check_area_access_ve(struct path *); #endif /* __LINUX_DCACHE_H */ diff -urNp linux-2.6.32.48/include/linux/device.h linux-2.6.32.48-openvz/include/linux/device.h --- linux-2.6.32.48/include/linux/device.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/device.h 2011-11-21 17:40:45.000000000 -0500 @@ -213,8 +213,16 @@ struct class_dev_iter { const struct device_type *type; }; +#ifndef CONFIG_VE extern struct kobject *sysfs_dev_block_kobj; extern struct kobject *sysfs_dev_char_kobj; +#define ve_sysfs_dev_block_kobj sysfs_dev_block_kobj +#define ve_sysfs_dev_char_kobj sysfs_dev_char_kobj +#else +#define ve_sysfs_dev_block_kobj (get_exec_env()->dev_block_kobj) +#define ve_sysfs_dev_char_kobj (get_exec_env()->dev_char_kobj) +#endif + extern int __must_check __class_register(struct class *class, struct lock_class_key *key); extern void class_unregister(struct class *class); @@ -279,6 +287,15 @@ extern struct class * __must_check __cla struct lock_class_key *key); extern void class_destroy(struct class *cls); +extern struct class net_class; +extern struct kset *class_kset; + +int classes_init(void); +void classes_fini(void); + +int devices_init(void); +void devices_fini(void); + /* This is a #define to keep the compiler from merging different * instances of the __key variable */ #define class_create(owner, name) \ diff -urNp linux-2.6.32.48/include/linux/devpts_fs.h linux-2.6.32.48-openvz/include/linux/devpts_fs.h --- linux-2.6.32.48/include/linux/devpts_fs.h 2011-11-08 
19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/devpts_fs.h 2011-11-21 17:40:45.000000000 -0500 @@ -26,6 +26,7 @@ struct tty_struct *devpts_get_tty(struct /* unlink */ void devpts_pty_kill(struct tty_struct *tty); +extern struct file_system_type devpts_fs_type; #else /* Dummy stubs in the no-pty case */ diff -urNp linux-2.6.32.48/include/linux/elf.h linux-2.6.32.48-openvz/include/linux/elf.h --- linux-2.6.32.48/include/linux/elf.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/elf.h 2011-11-21 17:40:45.000000000 -0500 @@ -406,5 +406,7 @@ static inline int elf_coredump_extra_not extern int elf_coredump_extra_notes_size(void); extern int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset); #endif +extern int sysctl_at_vsyscall; + #endif /* __KERNEL__ */ #endif /* _LINUX_ELF_H */ diff -urNp linux-2.6.32.48/include/linux/eventpoll.h linux-2.6.32.48-openvz/include/linux/eventpoll.h --- linux-2.6.32.48/include/linux/eventpoll.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/eventpoll.h 2011-11-21 17:40:45.000000000 -0500 @@ -17,6 +17,7 @@ /* For O_CLOEXEC */ #include #include +#include /* Flags for epoll_create1. */ #define EPOLL_CLOEXEC O_CLOEXEC @@ -63,6 +64,94 @@ static inline void eventpoll_init_file(s INIT_LIST_HEAD(&file->f_ep_links); } +struct epoll_filefd { + struct file *file; + int fd; +}; + +/* + * This structure is stored inside the "private_data" member of the file + * structure and represents the main data structure for the eventpoll + * interface. + */ +struct eventpoll { + /* Protects access to this structure */ + spinlock_t lock; + + /* + * This mutex is used to ensure that files are not removed + * while epoll is using them. This is held during the event + * collection loop, the file cleanup path, the epoll file exit + * code and the ctl operations. 
+ */ + struct mutex mtx; + + /* Wait queue used by sys_epoll_wait() */ + wait_queue_head_t wq; + + /* Wait queue used by file->poll() */ + wait_queue_head_t poll_wait; + + /* List of ready file descriptors */ + struct list_head rdllist; + + /* RB tree root used to store monitored fd structs */ + struct rb_root rbr; + + /* + * This is a single linked list that chains all the "struct epitem" that + * happened while transferring ready events to userspace w/out + * holding ->lock. + */ + struct epitem *ovflist; + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; +}; + +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the "rbr" RB tree. + */ +struct epitem { + /* RB tree node used to link this structure to the eventpoll RB tree */ + struct rb_node rbn; + + /* List header used to link this structure to the eventpoll ready list */ + struct list_head rdllink; + + /* + * Works together with "struct eventpoll"->ovflist to keep the + * single linked chain of items. 
+ */ + struct epitem *next; + + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + + /* Number of active wait queues attached to poll operations */ + int nwait; + + /* List containing poll wait queues */ + struct list_head pwqlist; + + /* The "container" of this item */ + struct eventpoll *ep; + + /* List header used to link this item to the "struct file" items list */ + struct list_head fllink; + + /* The structure that describes the interested events and the source fd */ + struct epoll_event event; + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; +}; + +extern struct semaphore epsem; +struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); +int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd); /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); @@ -95,6 +184,8 @@ static inline void eventpoll_release(str eventpoll_release_file(file); } +extern struct mutex epmutex; + #else static inline void eventpoll_init_file(struct file *file) {} diff -urNp linux-2.6.32.48/include/linux/fairsched.h linux-2.6.32.48-openvz/include/linux/fairsched.h --- linux-2.6.32.48/include/linux/fairsched.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/fairsched.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,92 @@ +/* + * Fair Scheduler + * + * Copyright (C) 2000-2008 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __LINUX_FAIRSCHED_H__ +#define __LINUX_FAIRSCHED_H__ + +#define FAIRSCHED_SET_RATE 0 +#define FAIRSCHED_DROP_RATE 1 +#define FAIRSCHED_GET_RATE 2 + +#ifdef __KERNEL__ + +/* refcnt changes are protected by the tasklist write lock */ +struct fairsched_node { + struct task_group *tg; + int refcnt; + unsigned id; + struct list_head nodelist; + + unsigned weight; + unsigned char rate_limited; + unsigned rate; +#ifdef CONFIG_VE + struct ve_struct *owner_env; +#endif +}; + +#ifdef CONFIG_VZ_FAIRSCHED + +#define FAIRSCHED_INIT_NODE_ID INT_MAX + +extern struct fairsched_node fairsched_init_node; + +void fairsched_init_early(void); +void fairsched_init_late(void); + +static inline int task_fairsched_node_id(struct task_struct *p) +{ + return p->fsched_node->id; +} + +/* must be called with tasklist write locked */ +static inline void get_task_fairsched_node(struct task_struct *p) +{ + p->fsched_node->refcnt++; +} +static inline void put_task_fairsched_node(struct task_struct *p) +{ + p->fsched_node->refcnt--; +} + +#define INIT_VZ_FAIRSCHED .fsched_node = &fairsched_init_node, + +#define FSCHWEIGHT_MAX ((1 << 16) - 1) +#define FSCHRATE_SHIFT 10 +#define FSCH_TIMESLICE 16 + +asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid); +asmlinkage int sys_fairsched_rmnod(unsigned int id); +asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid); +asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus); +asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight); +asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate); + +int fairsched_new_node(int id, unsigned int vcpus); +void fairsched_drop_node(int id); + +#else /* CONFIG_VZ_FAIRSCHED */ + +static inline void fairsched_init_early(void) { } +static inline void fairsched_init_late(void) { } +static inline int task_fairsched_node_id(struct task_struct *p) { return 0; } +static inline void get_task_fairsched_node(struct 
task_struct *p) { } +static inline void put_task_fairsched_node(struct task_struct *p) { } + +static inline int fairsched_new_node(int id, unsigned int vcpus) { return 0; } +static inline void fairsched_drop_node(int id) { } + +#define INIT_VZ_FAIRSCHED + +#endif /* CONFIG_VZ_FAIRSCHED */ +#endif /* __KERNEL__ */ + +#endif /* __LINUX_FAIRSCHED_H__ */ diff -urNp linux-2.6.32.48/include/linux/faudit.h linux-2.6.32.48-openvz/include/linux/faudit.h --- linux-2.6.32.48/include/linux/faudit.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/faudit.h 2011-11-21 17:40:45.000000000 -0500 @@ -0,0 +1,45 @@ +/* + * include/linux/faudit.h + * + * Copyright (C) 2005 SWSoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __FAUDIT_H_ +#define __FAUDIT_H_ + +#include + +struct vfsmount; +struct dentry; +struct super_block; +struct kstatfs; +struct kstat; +struct pt_regs; + +struct faudit_regs_arg { + int err; + struct pt_regs *regs; +}; + +struct faudit_stat_arg { + int err; + struct vfsmount *mnt; + struct dentry *dentry; + struct kstat *stat; +}; + +struct faudit_statfs_arg { + int err; + struct super_block *sb; + struct kstatfs *stat; +}; + +#define VIRTINFO_FAUDIT (0) +#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0) +#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1) + +#endif diff -urNp linux-2.6.32.48/include/linux/file.h linux-2.6.32.48-openvz/include/linux/file.h --- linux-2.6.32.48/include/linux/file.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/file.h 2011-11-21 17:40:45.000000000 -0500 @@ -41,4 +41,6 @@ extern void put_unused_fd(unsigned int f extern void fd_install(unsigned int fd, struct file *file); +extern struct kmem_cache *filp_cachep; + #endif /* __LINUX_FILE_H */ diff -urNp linux-2.6.32.48/include/linux/freezer.h linux-2.6.32.48-openvz/include/linux/freezer.h --- linux-2.6.32.48/include/linux/freezer.h 2011-11-08 19:02:43.000000000 -0500 +++ 
linux-2.6.32.48-openvz/include/linux/freezer.h 2011-11-21 17:40:45.000000000 -0500 @@ -163,6 +163,8 @@ static inline void set_freezable_with_si } while (try_to_freeze()); \ __retval; \ }) + +extern atomic_t global_suspend; #else /* !CONFIG_FREEZER */ static inline int frozen(struct task_struct *p) { return 0; } static inline int freezing(struct task_struct *p) { return 0; } diff -urNp linux-2.6.32.48/include/linux/fs.h linux-2.6.32.48-openvz/include/linux/fs.h --- linux-2.6.32.48/include/linux/fs.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/fs.h 2011-11-21 17:40:45.000000000 -0500 @@ -53,6 +53,7 @@ struct inodes_stat_t { #define MAY_APPEND 8 #define MAY_ACCESS 16 #define MAY_OPEN 32 +#define MAY_QUOTACTL 64 /* for devgroup-vs-openvz only */ /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond @@ -78,6 +79,8 @@ struct inodes_stat_t { /* File is opened using open(.., 3, ..) and is writeable only for ioctls (specialy hack for floppy.c) */ #define FMODE_WRITE_IOCTL ((__force fmode_t)256) +/* Can do sys_quotactl (for devperms) */ +#define FMODE_QUOTACTL ((__force fmode_t)512) /* * Don't update ctime and mtime. @@ -175,6 +178,8 @@ struct inodes_stat_t { #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 +#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */ +#define FS_MANGLE_PROC 128 /* hide some /proc/mounts info inside VE */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() * during rename() internally. @@ -235,6 +240,9 @@ struct inodes_stat_t { #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE 512 /* Inode is fs-internal */ +/* VZ flags -- These are not upstream! 
*/ +#define S_NOUNUSE (1 << 17) /* just destroy inode in cleanup */ + /* * Note that nosuid etc flags are inode-specific: setting some file-system * flags just means all the inodes inherit those flags by default. It might be @@ -370,7 +378,6 @@ struct inodes_stat_t { #include #include #include -#include #include #include #include @@ -405,6 +412,7 @@ extern int get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; +extern int odirect_enable; #ifdef CONFIG_DNOTIFY extern int dir_notify_enable; #endif @@ -464,10 +472,15 @@ struct iattr { struct file *ia_file; }; +#include + /* * Includes for diskquotas. */ #include +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) +#include +#endif /** * enum positive_aop_returns - aop return codes with specific semantics @@ -755,6 +768,9 @@ struct inode { #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; #endif +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + struct vz_quota_ilink i_qlnk; +#endif struct list_head i_devices; union { struct pipe_inode_info *i_pipe; @@ -810,6 +826,8 @@ enum inode_i_mutex_lock_class I_MUTEX_QUOTA }; +extern struct kmem_cache *inode_cachep; + /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic @@ -930,6 +948,7 @@ struct file { struct fown_struct f_owner; const struct cred *f_cred; struct file_ra_state f_ra; + struct user_beancounter *f_ub; u64 f_version; #ifdef CONFIG_SECURITY @@ -946,6 +965,7 @@ struct file { #ifdef CONFIG_DEBUG_WRITECOUNT unsigned long f_mnt_write_state; #endif + struct ve_struct *owner_env; }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); @@ -1064,6 +1084,9 @@ struct file_lock { fl_owner_t fl_owner; unsigned char fl_flags; unsigned char fl_type; +#ifdef CONFIG_BEANCOUNTERS + unsigned char fl_charged; +#endif unsigned int fl_pid; struct pid *fl_nspid; wait_queue_head_t fl_wait; @@ 
-1512,6 +1535,7 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + struct file * (*get_host)(struct file *); }; struct inode_operations { @@ -1581,6 +1605,7 @@ struct super_operations { #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + struct inode *(*get_quota_root)(struct super_block *); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); }; @@ -1758,8 +1783,14 @@ struct file_system_type { struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; struct lock_class_key i_alloc_sem_key; + + struct file_system_type *proto; + struct ve_struct *owner_env; }; +void get_filesystem(struct file_system_type *fs); +void put_filesystem(struct file_system_type *fs); + extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int), struct vfsmount *mnt); @@ -1803,13 +1834,20 @@ extern int register_filesystem(struct fi extern int unregister_filesystem(struct file_system_type *); extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); #define kern_mount(type) kern_mount_data(type, NULL) +extern int register_ve_fs_type(struct ve_struct *, struct file_system_type *, + struct file_system_type **, struct vfsmount **); +extern void unregister_ve_fs_type(struct file_system_type *, struct vfsmount *); +extern void umount_ve_fs_type(struct file_system_type *local_fs_type, int veid); +#define kern_umount mntput extern int may_umount_tree(struct vfsmount *); +extern struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root); extern int may_umount(struct vfsmount *); extern long do_mount(char 
*, char *, char *, unsigned long, void *); extern struct vfsmount *collect_mounts(struct path *); extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); +extern int faudit_statfs(struct super_block *, struct kstatfs *); extern int current_umask(void); @@ -2068,7 +2106,8 @@ extern int check_disk_change(struct bloc extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif -extern int invalidate_inodes(struct super_block *); +extern int invalidate_inodes_check(struct super_block *, int check); +#define invalidate_inodes(sb) invalidate_inodes_check(sb, 0) unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -2482,6 +2521,17 @@ ssize_t simple_attr_read(struct file *fi ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); +static inline void *file_private(struct file *file) +{ + struct file *host = file; + + while (host->f_op->get_host) { + host = host->f_op->get_host(host); + BUG_ON(host->f_mapping != file->f_mapping); + } + return host->private_data; +} + struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff -urNp linux-2.6.32.48/include/linux/fsnotify_backend.h linux-2.6.32.48-openvz/include/linux/fsnotify_backend.h --- linux-2.6.32.48/include/linux/fsnotify_backend.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/fsnotify_backend.h 2011-11-21 17:40:45.000000000 -0500 @@ -85,6 +85,7 @@ struct fsnotify_ops { void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); void (*free_event_priv)(struct fsnotify_event_private_data *priv); + void (*detach_mnt)(struct fsnotify_mark_entry *e); }; /* @@ -348,6 +349,7 @@ extern void fsnotify_clear_marks_by_grou extern void 
fsnotify_get_mark(struct fsnotify_mark_entry *entry); extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry); extern void fsnotify_unmount_inodes(struct list_head *list); +extern void fsnotify_unmount_mnt(struct vfsmount *mnt); /* put here because inotify does some weird stuff when destroying watches */ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, @@ -380,6 +382,7 @@ static inline u32 fsnotify_get_cookie(vo static inline void fsnotify_unmount_inodes(struct list_head *list) {} +static inline void fsnotify_unmount_mnt(struct vfsmount *mnt) { } #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ diff -urNp linux-2.6.32.48/include/linux/futex.h linux-2.6.32.48-openvz/include/linux/futex.h --- linux-2.6.32.48/include/linux/futex.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/futex.h 2011-11-21 17:40:45.000000000 -0500 @@ -132,6 +132,7 @@ union ktime; long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout, u32 __user *uaddr2, u32 val2, u32 val3); +long futex_wait_restart(struct restart_block *restart); extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); diff -urNp linux-2.6.32.48/include/linux/gfp.h linux-2.6.32.48-openvz/include/linux/gfp.h --- linux-2.6.32.48/include/linux/gfp.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/gfp.h 2011-11-21 17:40:45.000000000 -0500 @@ -52,6 +52,8 @@ struct vm_area_struct; #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ +#define __GFP_UBC ((__force gfp_t)0x100000u)/* charge kmem in buddy and slab */ +#define __GFP_SOFT_UBC ((__force gfp_t)0x400000u)/* use soft charging */ #ifdef CONFIG_KMEMCHECK #define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */ 
@@ -65,19 +67,22 @@ struct vm_area_struct; */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */ #define GFP_ATOMIC (__GFP_HIGH) +#define GFP_ATOMIC_UBC (__GFP_HIGH | __GFP_UBC) #define GFP_NOIO (__GFP_WAIT) #define GFP_NOFS (__GFP_WAIT | __GFP_IO) #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) +#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC) #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ __GFP_RECLAIMABLE) #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC) #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ diff -urNp linux-2.6.32.48/include/linux/hardirq.h linux-2.6.32.48-openvz/include/linux/hardirq.h --- linux-2.6.32.48/include/linux/hardirq.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/hardirq.h 2011-11-21 17:40:45.000000000 -0500 @@ -10,6 +10,9 @@ #include #include +#include +#include + /* * We put the hardirq and softirq counter into the preemption * counter. 
The bitmask has the following meaning: @@ -157,6 +160,24 @@ extern void rcu_nmi_exit(void); # define rcu_nmi_exit() do { } while (0) #endif /* #if defined(CONFIG_NO_HZ) */ +#define save_context() do { \ + struct task_struct *tsk; \ + if (hardirq_count() == HARDIRQ_OFFSET) { \ + tsk = current; \ + ve_save_context(tsk); \ + ub_save_context(tsk); \ + } \ + } while (0) + +#define restore_context() do { \ + struct task_struct *tsk; \ + if (hardirq_count() == HARDIRQ_OFFSET) { \ + tsk = current; \ + ve_restore_context(tsk); \ + ub_restore_context(tsk); \ + } \ + } while (0) + /* * It is safe to do non-atomic ops on ->hardirq_context, * because NMI handlers may not preempt and the ops are @@ -167,6 +188,7 @@ extern void rcu_nmi_exit(void); do { \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ + save_context(); \ trace_hardirq_enter(); \ } while (0) @@ -182,6 +204,7 @@ extern void irq_enter(void); do { \ trace_hardirq_exit(); \ account_system_vtime(current); \ + restore_context(); \ sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) diff -urNp linux-2.6.32.48/include/linux/hrtimer.h linux-2.6.32.48-openvz/include/linux/hrtimer.h --- linux-2.6.32.48/include/linux/hrtimer.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/hrtimer.h 2011-11-21 17:40:45.000000000 -0500 @@ -416,6 +416,9 @@ extern long hrtimer_nanosleep(struct tim const enum hrtimer_mode mode, const clockid_t clockid); extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); +#ifdef CONFIG_COMPAT +long compat_nanosleep_restart(struct restart_block *restart); +#endif extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); diff -urNp linux-2.6.32.48/include/linux/if_bridge.h linux-2.6.32.48-openvz/include/linux/if_bridge.h --- linux-2.6.32.48/include/linux/if_bridge.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/if_bridge.h 2011-11-21 17:40:45.000000000 -0500 @@ -42,6 +42,7 @@ #define 
BRCTL_SET_PORT_PRIORITY 16 #define BRCTL_SET_PATH_COST 17 #define BRCTL_GET_FDB_ENTRIES 18 +#define BRCTL_SET_VIA_ORIG_DEV 19 #define BR_STATE_DISABLED 0 #define BR_STATE_LISTENING 1 @@ -70,6 +71,7 @@ struct __bridge_info __u32 tcn_timer_value; __u32 topology_change_timer_value; __u32 gc_timer_value; + __u8 via_phys_dev; }; struct __port_info @@ -104,9 +106,12 @@ struct __fdb_entry #include +#define BR_ALREADY_SEEN 1 + extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff *skb); +extern int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port); extern int (*br_should_route_hook)(struct sk_buff *skb); #endif diff -urNp linux-2.6.32.48/include/linux/if_vlan.h linux-2.6.32.48-openvz/include/linux/if_vlan.h --- linux-2.6.32.48/include/linux/if_vlan.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/if_vlan.h 2011-11-21 17:40:45.000000000 -0500 @@ -84,6 +84,9 @@ struct vlan_group { struct hlist_node hlist; /* linked list */ struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS]; struct rcu_head rcu; +#ifdef CONFIG_VE + struct ve_struct *owner; +#endif }; static inline struct net_device *vlan_group_get_device(struct vlan_group *vg, diff -urNp linux-2.6.32.48/include/linux/init_task.h linux-2.6.32.48-openvz/include/linux/init_task.h --- linux-2.6.32.48/include/linux/init_task.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/init_task.h 2011-11-21 17:40:45.000000000 -0500 @@ -11,6 +11,7 @@ #include #include #include +#include extern struct files_struct init_files; extern struct fs_struct init_fs; @@ -31,10 +32,17 @@ extern struct fs_struct init_fs; }, \ } +#ifdef CONFIG_VE +/* one for ve0, one for init_task */ +#define INIT_NSPROXY_COUNT ATOMIC_INIT(2) +#else +#define INIT_NSPROXY_COUNT ATOMIC_INIT(1) +#endif + extern struct nsproxy init_nsproxy; #define 
INIT_NSPROXY(nsproxy) { \ .pid_ns = &init_pid_ns, \ - .count = ATOMIC_INIT(1), \ + .count = INIT_NSPROXY_COUNT, \ .uts_ns = &init_uts_ns, \ .mnt_ns = NULL, \ INIT_NET_NS(net_ns) \ @@ -184,6 +192,7 @@ extern struct cred init_cred; INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_VZ_FAIRSCHED \ } diff -urNp linux-2.6.32.48/include/linux/inotify.h linux-2.6.32.48-openvz/include/linux/inotify.h --- linux-2.6.32.48/include/linux/inotify.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/inotify.h 2011-11-21 17:40:45.000000000 -0500 @@ -101,6 +101,11 @@ struct inotify_operations { void (*destroy_watch)(struct inotify_watch *); }; +struct fsnotify_group; +extern const struct file_operations inotify_fops; +int __inotify_new_watch(struct fsnotify_group *group, + struct path *path, __u32 mask, int wd); + #ifdef CONFIG_INOTIFY /* Kernel API for producing events */ diff -urNp linux-2.6.32.48/include/linux/iocontext.h linux-2.6.32.48-openvz/include/linux/iocontext.h --- linux-2.6.32.48/include/linux/iocontext.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/iocontext.h 2011-11-21 17:55:30.000000000 -0500 @@ -23,11 +23,6 @@ struct as_io_context { unsigned long ttime_total; unsigned long ttime_samples; unsigned long ttime_mean; - /* Layout pattern */ - unsigned int seek_samples; - sector_t last_request_pos; - u64 seek_total; - sector_t seek_mean; }; struct cfq_queue; @@ -68,6 +63,10 @@ struct io_context { unsigned short ioprio; unsigned short ioprio_changed; +#ifdef CONFIG_BLK_CGROUP + unsigned short cgroup_changed; +#endif + /* * For request batching */ diff -urNp linux-2.6.32.48/include/linux/ioprio.h linux-2.6.32.48-openvz/include/linux/ioprio.h --- linux-2.6.32.48/include/linux/ioprio.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/ioprio.h 2011-11-21 17:40:47.000000000 -0500 @@ -39,6 +39,7 @@ enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, + 
IOPRIO_WHO_UBC = 1000, }; /* diff -urNp linux-2.6.32.48/include/linux/ipc.h linux-2.6.32.48-openvz/include/linux/ipc.h --- linux-2.6.32.48/include/linux/ipc.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/ipc.h 2011-11-21 17:40:47.000000000 -0500 @@ -79,6 +79,7 @@ struct ipc_kludge { #ifdef __KERNEL__ #include +#include #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ @@ -98,6 +99,15 @@ struct kern_ipc_perm void *security; }; +struct ipc_ids; + +struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); +static inline void ipc_unlock(struct kern_ipc_perm *perm) +{ + spin_unlock(&perm->lock); + rcu_read_unlock(); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_IPC_H */ diff -urNp linux-2.6.32.48/include/linux/kdev_t.h linux-2.6.32.48-openvz/include/linux/kdev_t.h --- linux-2.6.32.48/include/linux/kdev_t.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/kdev_t.h 2011-11-21 17:40:47.000000000 -0500 @@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 de return dev & 0x3ffff; } +#define UNNAMED_MAJOR_COUNT 16 + +#if UNNAMED_MAJOR_COUNT > 1 + +extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT]; + +static inline dev_t make_unnamed_dev(int idx) +{ + /* + * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the + * unnamed device index into major number. 
+ */ + return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)], + idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8)); +} + +static inline int unnamed_dev_idx(dev_t dev) +{ + int i; + for (i = 0; i < UNNAMED_MAJOR_COUNT && + MAJOR(dev) != unnamed_dev_majors[i]; i++); + return MINOR(dev) | (i << 8); +} + +static inline int is_unnamed_dev(dev_t dev) +{ + int i; + for (i = 0; i < UNNAMED_MAJOR_COUNT && + MAJOR(dev) != unnamed_dev_majors[i]; i++); + return i < UNNAMED_MAJOR_COUNT; +} + +#else /* UNNAMED_MAJOR_COUNT */ + +static inline dev_t make_unnamed_dev(int idx) +{ + return MKDEV(0, idx); +} + +static inline int unnamed_dev_idx(dev_t dev) +{ + return MINOR(dev); +} + +static inline int is_unnamed_dev(dev_t dev) +{ + return MAJOR(dev) == 0; +} + +#endif /* UNNAMED_MAJOR_COUNT */ + #else /* __KERNEL__ */ /* diff -urNp linux-2.6.32.48/include/linux/kernel.h linux-2.6.32.48-openvz/include/linux/kernel.h --- linux-2.6.32.48/include/linux/kernel.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/kernel.h 2011-11-21 17:40:47.000000000 -0500 @@ -245,6 +245,12 @@ extern struct ratelimit_state printk_rat extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); +asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) + __attribute__ ((format (printf, 2, 0))); +asmlinkage int ve_printk(int, const char * fmt, ...) + __attribute__ ((format (printf, 2, 3))); +void prepare_printk(void); + extern int printk_delay_msec; @@ -272,6 +278,15 @@ static inline int printk_ratelimit(void) static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } +static inline int ve_printk(int d, const char *s, ...) + __attribute__ ((format (printf, 2, 3))); +static inline int ve_printk(int d, const char *s, ...) 
+{ + return 0; +} +static inline void prepare_printk(void) +{ +} /* No effect, but we still get type checking even in the !PRINTK case: */ #define printk_once(x...) printk(x) @@ -289,9 +304,17 @@ extern void asmlinkage __attribute__((fo unsigned long int_sqrt(unsigned long); +#define VE0_LOG 1 +#define VE_LOG 2 +#define VE_LOG_BOTH (VE0_LOG | VE_LOG) +extern int console_silence_loglevel; + static inline void console_silent(void) { - console_loglevel = 0; + if (console_loglevel > console_silence_loglevel) { + printk(KERN_EMERG "console shuts up ...\n"); + console_loglevel = 0; + } } static inline void console_verbose(void) @@ -305,6 +328,7 @@ extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; +extern int decode_call_traces; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern const char *print_tainted(void); diff -urNp linux-2.6.32.48/include/linux/kobject.h linux-2.6.32.48-openvz/include/linux/kobject.h --- linux-2.6.32.48/include/linux/kobject.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/kobject.h 2011-11-21 17:40:47.000000000 -0500 @@ -51,6 +51,8 @@ enum kobject_action { KOBJ_REMOVE, KOBJ_CHANGE, KOBJ_MOVE, + KOBJ_START, + KOBJ_STOP, KOBJ_ONLINE, KOBJ_OFFLINE, KOBJ_MAX diff -urNp linux-2.6.32.48/include/linux/kthread.h linux-2.6.32.48-openvz/include/linux/kthread.h --- linux-2.6.32.48/include/linux/kthread.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/kthread.h 2011-11-21 17:40:47.000000000 -0500 @@ -4,10 +4,19 @@ #include #include -struct task_struct *kthread_create(int (*threadfn)(void *data), +struct task_struct *kthread_create_ve(struct ve_struct *ve, + int (*threadfn)(void *data), void *data, const char namefmt[], ...) 
- __attribute__((format(printf, 3, 4))); + __attribute__((format(printf, 4, 5))); + +#define kthread_create(threadfn, data, namefmt, ...) \ +({ \ + struct task_struct *__k \ + = kthread_create_ve(get_ve0(), threadfn, data, namefmt, \ + ## __VA_ARGS__); \ + __k; \ +}) /** * kthread_run - create and wake a thread. @@ -27,6 +36,17 @@ struct task_struct *kthread_create(int ( __k; \ }) +/* Like kthread_run() but run a thread in VE context */ +#define kthread_run_ve(ve, threadfn, data, namefmt, ...) \ +({ \ + struct task_struct *__k \ + = kthread_create_ve(ve, threadfn, data, namefmt, \ + ## __VA_ARGS__); \ + if (!IS_ERR(__k)) \ + wake_up_process(__k); \ + __k; \ +}) + void kthread_bind(struct task_struct *k, unsigned int cpu); int kthread_stop(struct task_struct *k); int kthread_should_stop(void); diff -urNp linux-2.6.32.48/include/linux/lockd/lockd.h linux-2.6.32.48-openvz/include/linux/lockd/lockd.h --- linux-2.6.32.48/include/linux/lockd/lockd.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/lockd/lockd.h 2011-11-21 17:40:47.000000000 -0500 @@ -66,6 +66,7 @@ struct nlm_host { struct list_head h_reclaim; /* Locks in RECLAIM state */ struct nsm_handle *h_nsmhandle; /* NSM status handle */ char *h_addrbuf; /* address eyecatcher */ + struct ve_struct * owner_env; /* VE owning the host */ }; /* @@ -192,8 +193,10 @@ extern struct svc_procedure nlmsvc_proce #ifdef CONFIG_LOCKD_V4 extern struct svc_procedure nlmsvc_procedures4[]; #endif -extern int nlmsvc_grace_period; -extern unsigned long nlmsvc_timeout; + +#include +extern unsigned long _nlmsvc_timeout; + extern int nsm_use_hostnames; extern u32 nsm_local_state; diff -urNp linux-2.6.32.48/include/linux/major.h linux-2.6.32.48-openvz/include/linux/major.h --- linux-2.6.32.48/include/linux/major.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/major.h 2011-11-21 17:40:47.000000000 -0500 @@ -174,4 +174,7 @@ #define BLOCK_EXT_MAJOR 259 #define SCSI_OSD_MAJOR 260 /* 
open-osd's OSD scsi device */ +#define UNNAMED_EXTRA_MAJOR 130 +#define UNNAMED_EXTRA_MAJOR_COUNT 120 + #endif diff -urNp linux-2.6.32.48/include/linux/mman.h linux-2.6.32.48-openvz/include/linux/mman.h --- linux-2.6.32.48/include/linux/mman.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/mman.h 2011-11-21 17:40:47.000000000 -0500 @@ -85,6 +85,9 @@ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | +#ifdef MAP_GROWSUP + _calc_vm_trans(flags, MAP_GROWSUP, VM_GROWSUP ) | +#endif _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); diff -urNp linux-2.6.32.48/include/linux/mm.h linux-2.6.32.48-openvz/include/linux/mm.h --- linux-2.6.32.48/include/linux/mm.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/mm.h 2011-11-21 17:40:47.000000000 -0500 @@ -716,6 +716,7 @@ extern void pagefault_out_of_memory(void extern void show_free_areas(void); int shmem_lock(struct file *file, int lock, struct user_struct *user); +#define shmem_nopage filemap_nopage struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); @@ -780,7 +781,9 @@ int walk_page_range(unsigned long addr, void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma, + unsigned long addr, size_t size); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pfn(struct vm_area_struct *vma, unsigned long address, @@ -836,7 +839,7 
@@ int __set_page_dirty_nobuffers(struct pa int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); -void account_page_dirtied(struct page *page, struct address_space *mapping); +int account_page_dirtied(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); @@ -1306,7 +1309,12 @@ unsigned long shrink_slab(unsigned long #ifndef CONFIG_MMU #define randomize_va_space 0 #else -extern int randomize_va_space; +extern int _randomize_va_space; +#ifndef CONFIG_VE +#define randomize_va_space _randomize_va_space +#else +#define randomize_va_space (get_exec_env()->_randomize_va_space) +#endif #endif const char * arch_vma_name(struct vm_area_struct *vma); diff -urNp linux-2.6.32.48/include/linux/mm_types.h linux-2.6.32.48-openvz/include/linux/mm_types.h --- linux-2.6.32.48/include/linux/mm_types.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/mm_types.h 2011-11-21 17:40:47.000000000 -0500 @@ -106,6 +106,14 @@ struct page { */ void *shadow; #endif +#ifdef CONFIG_BEANCOUNTERS + /* FIXME: switch to mainline memcgroup */ + union { + struct user_beancounter *page_ub; + struct page_beancounter *page_pb; + struct user_beancounter **slub_ubs; + } bc; +#endif }; /* @@ -260,6 +268,12 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ + unsigned int vps_dumpable:2; + unsigned int oom_killed:1; + +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *mm_ub; +#endif struct core_state *core_state; /* coredumping support */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; diff -urNp linux-2.6.32.48/include/linux/mnt_namespace.h linux-2.6.32.48-openvz/include/linux/mnt_namespace.h --- linux-2.6.32.48/include/linux/mnt_namespace.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/mnt_namespace.h 
2011-11-21 17:40:47.000000000 -0500 @@ -26,6 +26,8 @@ struct fs_struct; extern struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt); extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); +extern struct rw_semaphore namespace_sem; + extern void put_mnt_ns(struct mnt_namespace *ns); static inline void get_mnt_ns(struct mnt_namespace *ns) { diff -urNp linux-2.6.32.48/include/linux/mount.h linux-2.6.32.48-openvz/include/linux/mount.h --- linux-2.6.32.48/include/linux/mount.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/mount.h 2011-11-21 17:40:47.000000000 -0500 @@ -70,6 +70,7 @@ struct vfsmount { #else int mnt_writers; #endif + unsigned owner; }; static inline int *get_mnt_writers_ptr(struct vfsmount *mnt) diff -urNp linux-2.6.32.48/include/linux/msg.h linux-2.6.32.48-openvz/include/linux/msg.h --- linux-2.6.32.48/include/linux/msg.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/msg.h 2011-11-21 17:40:47.000000000 -0500 @@ -107,6 +107,14 @@ extern long do_msgsnd(int msqid, long mt extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext, size_t msgsz, long msgtyp, int msgflg); +int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg); +int sysvipc_setup_msg(key_t key, int msqid, int msgflg); +int sysv_msg_store(struct msg_msg *msg, + int (*store)(void * src, int len, int offset, void * data), + int len, void * data); +struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset, + void * data), int len, void * data); + #endif /* __KERNEL__ */ #endif /* _LINUX_MSG_H */ diff -urNp linux-2.6.32.48/include/linux/namei.h linux-2.6.32.48-openvz/include/linux/namei.h --- linux-2.6.32.48/include/linux/namei.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/namei.h 2011-11-21 17:40:47.000000000 -0500 @@ -56,6 +56,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA #define LOOKUP_CREATE 0x0200 #define 
LOOKUP_EXCL 0x0400 #define LOOKUP_RENAME_TARGET 0x0800 +#define LOOKUP_NOAREACHECK 0x1000 /* no area check on lookup */ +#define LOOKUP_STRICT 0x2000 /* no symlinks or other filesystems */ extern int user_path_at(int, const char __user *, unsigned, struct path *); diff -urNp linux-2.6.32.48/include/linux/netdevice.h linux-2.6.32.48-openvz/include/linux/netdevice.h --- linux-2.6.32.48/include/linux/netdevice.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netdevice.h 2011-11-21 17:40:47.000000000 -0500 @@ -300,6 +300,11 @@ enum netdev_state_t __LINK_STATE_DORMANT, }; +struct netdev_bc { + struct user_beancounter *exec_ub, *owner_ub; +}; + +#define netdev_bc(dev) (&(dev)->dev_bc) /* * This structure holds at boot time configured netdevice settings. They @@ -485,6 +490,10 @@ struct netdev_queue { unsigned long tx_dropped; } ____cacheline_aligned_in_smp; +struct cpt_context; +struct cpt_ops; +struct rst_ops; +struct cpt_netdev_image; /* * This structure defines the management hooks for network devices. @@ -636,8 +645,23 @@ struct net_device_ops { int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); #endif + void (*ndo_cpt)(struct net_device *dev, + struct cpt_ops *, + struct cpt_context *); +}; + +struct netdev_rst { + int cpt_object; + int (*ndo_rst)(loff_t, struct cpt_netdev_image *, + struct rst_ops *, + struct cpt_context *); + struct list_head list; }; +void register_netdev_rst(struct netdev_rst *ops); +void unregister_netdev_rst(struct netdev_rst *ops); +struct netdev_rst *netdev_find_rst(int cpt_object, struct netdev_rst *ops); + /* * The DEVICE structure. * Actually, this whole structure is a big mistake. 
It mixes I/O @@ -708,6 +732,8 @@ struct net_device #define NETIF_F_FCOE_CRC (1 << 24) /* FCoE CRC32 */ #define NETIF_F_SCTP_CSUM (1 << 25) /* SCTP checksum offload */ #define NETIF_F_FCOE_MTU (1 << 26) /* Supports max FCoE MTU, 2158 bytes*/ +#define NETIF_F_VENET (1 << 27) /* device is venet device */ +#define NETIF_F_VIRTUAL (1 << 28) /* can be registered inside VE */ /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 @@ -892,6 +918,9 @@ struct net_device /* GARP */ struct garp_port *garp_port; + struct ve_struct *owner_env; /* Owner VE of the interface */ + struct netdev_bc dev_bc; + /* class/net/name entry */ struct device dev; /* space for optional statistics and wireless sysfs groups */ @@ -919,6 +948,20 @@ struct net_device }; #define to_net_dev(d) container_of(d, struct net_device, dev) +#define NETDEV_HASHBITS 8 +#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) + +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) +{ + unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; +} + +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +{ + return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; +} + #define NETDEV_ALIGN 32 static inline @@ -1496,6 +1539,8 @@ extern int dev_ethtool(struct net *net, extern unsigned dev_get_flags(const struct net_device *); extern int dev_change_flags(struct net_device *, unsigned); extern int dev_change_name(struct net_device *, const char *); +int __dev_change_net_namespace(struct net_device *, struct net *, const char *, + struct user_beancounter *exec_ub); extern int dev_set_alias(struct net_device *, const char *, size_t); extern int dev_change_net_namespace(struct net_device *, struct net *, const char *); @@ -1919,6 +1964,18 @@ unsigned long netdev_increment_features( unsigned long mask); unsigned long netdev_fix_features(unsigned long features, const char *name); 
+#if defined(CONFIG_VE) && defined(CONFIG_NET) +static inline int ve_is_dev_movable(struct net_device *dev) +{ + return !(dev->features & (NETIF_F_VIRTUAL | NETIF_F_NETNS_LOCAL)); +} +#else +static inline int ve_is_dev_movable(struct net_device *dev) +{ + return 0; +} +#endif + static inline int net_gso_ok(int features, int gso_type) { int feature = gso_type << NETIF_F_GSO_SHIFT; diff -urNp linux-2.6.32.48/include/linux/netfilter/x_tables.h linux-2.6.32.48-openvz/include/linux/netfilter/x_tables.h --- linux-2.6.32.48/include/linux/netfilter/x_tables.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter/x_tables.h 2011-11-21 17:40:47.000000000 -0500 @@ -375,6 +375,7 @@ struct xt_table_info { /* Size per table */ unsigned int size; + unsigned int alloc_size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. Needed for module usage count */ @@ -605,6 +606,23 @@ extern int xt_compat_target_to_user(stru void __user **dstptr, unsigned int *size); #endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_VE +static inline bool ve_xt_table_forbidden(struct xt_table *xt) +{ + /* + * The only purpose to have this check as a separate + * helper is "grep"-a-bility + * + * If this helper hit it means that a VE has been + * configured without the particular xt_table support + */ + return xt == NULL; +} +#else +static inline bool ve_xt_table_forbidden(struct xt_table *xt) { return true; } +#endif + #endif /* __KERNEL__ */ #endif /* _X_TABLES_H */ diff -urNp linux-2.6.32.48/include/linux/netfilter/xt_connmark.h linux-2.6.32.48-openvz/include/linux/netfilter/xt_connmark.h --- linux-2.6.32.48/include/linux/netfilter/xt_connmark.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter/xt_connmark.h 2011-11-21 17:40:47.000000000 -0500 @@ -12,6 +12,11 @@ * (at your option) any later version. 
*/ +struct xt_connmark_info { + unsigned long mark, mask; + __u8 invert; +}; + struct xt_connmark_mtinfo1 { __u32 mark, mask; __u8 invert; diff -urNp linux-2.6.32.48/include/linux/netfilter/xt_CONNMARK.h linux-2.6.32.48-openvz/include/linux/netfilter/xt_CONNMARK.h --- linux-2.6.32.48/include/linux/netfilter/xt_CONNMARK.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter/xt_CONNMARK.h 2011-11-21 17:40:47.000000000 -0500 @@ -18,6 +18,12 @@ enum { XT_CONNMARK_RESTORE }; +struct xt_connmark_target_info { + unsigned long mark; + unsigned long mask; + __u8 mode; +}; + struct xt_connmark_tginfo1 { __u32 ctmark, ctmask, nfmask; __u8 mode; diff -urNp linux-2.6.32.48/include/linux/netfilter/xt_conntrack.h linux-2.6.32.48-openvz/include/linux/netfilter/xt_conntrack.h --- linux-2.6.32.48/include/linux/netfilter/xt_conntrack.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter/xt_conntrack.h 2011-11-21 17:40:47.000000000 -0500 @@ -32,6 +32,42 @@ enum { XT_CONNTRACK_DIRECTION = 1 << 12, }; +/* This is exposed to userspace, so remains frozen in time. */ +struct ip_conntrack_old_tuple +{ + struct { + __be32 ip; + union { + __u16 all; + } u; + } src; + + struct { + __be32 ip; + union { + __u16 all; + } u; + + /* The protocol. 
*/ + __u16 protonum; + } dst; +}; + +struct xt_conntrack_info +{ + unsigned int statemask, statusmask; + + struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX]; + struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX]; + + unsigned long expires_min, expires_max; + + /* Flags word */ + __u8 flags; + /* Inverse flags */ + __u8 invflags; +}; + struct xt_conntrack_mtinfo1 { union nf_inet_addr origsrc_addr, origsrc_mask; union nf_inet_addr origdst_addr, origdst_mask; diff -urNp linux-2.6.32.48/include/linux/netfilter/xt_hashlimit.h linux-2.6.32.48-openvz/include/linux/netfilter/xt_hashlimit.h --- linux-2.6.32.48/include/linux/netfilter/xt_hashlimit.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter/xt_hashlimit.h 2011-11-21 17:40:47.000000000 -0500 @@ -65,4 +65,11 @@ struct xt_hashlimit_mtinfo1 { struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); }; +#ifdef __KERNEL__ +struct ve_xt_hashlimit { + struct hlist_head hashlimit_htables; + struct proc_dir_entry *hashlimit_procdir4; + struct proc_dir_entry *hashlimit_procdir6; +}; +#endif #endif /*_XT_HASHLIMIT_H*/ diff -urNp linux-2.6.32.48/include/linux/netfilter/xt_mark.h linux-2.6.32.48-openvz/include/linux/netfilter/xt_mark.h --- linux-2.6.32.48/include/linux/netfilter/xt_mark.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter/xt_mark.h 2011-11-21 17:40:47.000000000 -0500 @@ -3,6 +3,11 @@ #include +struct xt_mark_info { + unsigned long mark, mask; + __u8 invert; +}; + struct xt_mark_mtinfo1 { __u32 mark, mask; __u8 invert; diff -urNp linux-2.6.32.48/include/linux/netfilter/xt_MARK.h linux-2.6.32.48-openvz/include/linux/netfilter/xt_MARK.h --- linux-2.6.32.48/include/linux/netfilter/xt_MARK.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter/xt_MARK.h 2011-11-21 17:40:47.000000000 -0500 @@ -3,6 +3,23 @@ #include +/* Version 0 */ +struct xt_mark_target_info { + unsigned long mark; +}; + +/* Version 
1 */ +enum { + XT_MARK_SET=0, + XT_MARK_AND, + XT_MARK_OR, +}; + +struct xt_mark_target_info_v1 { + unsigned long mark; + __u8 mode; +}; + struct xt_mark_tginfo2 { __u32 mark, mask; }; diff -urNp linux-2.6.32.48/include/linux/netfilter/xt_recent.h linux-2.6.32.48-openvz/include/linux/netfilter/xt_recent.h --- linux-2.6.32.48/include/linux/netfilter/xt_recent.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter/xt_recent.h 2011-11-21 17:40:47.000000000 -0500 @@ -25,4 +25,15 @@ struct xt_recent_mtinfo { __u8 side; }; +#ifdef __KERNEL__ +struct ve_ipt_recent { + struct list_head tables; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_dir; +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT + struct proc_dir_entry *proc_old_dir; +#endif +#endif +}; +#endif #endif /* _LINUX_NETFILTER_XT_RECENT_H */ diff -urNp linux-2.6.32.48/include/linux/netfilter.h linux-2.6.32.48-openvz/include/linux/netfilter.h --- linux-2.6.32.48/include/linux/netfilter.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter.h 2011-11-21 17:40:47.000000000 -0500 @@ -353,5 +353,28 @@ extern void (*nf_ct_destroy)(struct nf_c static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif +#ifdef CONFIG_VE_IPTABLES +#include + +#define net_ipt_permitted(netns, ipt) \ + (mask_ipt_allow((netns)->owner_ve->ipt_mask, ipt)) + +#define net_ipt_module_permitted(netns, ipt) \ + (mask_ipt_allow((netns)->owner_ve->ipt_mask, ipt) && \ + mask_ipt_allow((netns)->owner_ve->_iptables_modules, \ + (ipt) & ~(ipt##_MOD))) + +#define net_ipt_module_set(netns, ipt) \ + ({ \ + (netns)->owner_ve->_iptables_modules |= ipt##_MOD; \ + }) +#define net_is_ipt_module_set(netns, ipt) \ + ((netns)->owner_ve->_iptables_modules & (ipt##_MOD)) +#else +#define net_ipt_module_permitted(netns, ipt) (1) +#define net_ipt_module_set(netns, ipt) +#define net_is_ipt_module_set(netns, ipt) (1) +#endif + #endif /*__KERNEL__*/ #endif 
/*__LINUX_NETFILTER_H*/ diff -urNp linux-2.6.32.48/include/linux/netfilter_ipv4/ipt_iprange.h linux-2.6.32.48-openvz/include/linux/netfilter_ipv4/ipt_iprange.h --- linux-2.6.32.48/include/linux/netfilter_ipv4/ipt_iprange.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter_ipv4/ipt_iprange.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,23 @@ +#ifndef _IPT_IPRANGE_H +#define _IPT_IPRANGE_H + +#define IPRANGE_SRC 0x01 /* Match source IP address */ +#define IPRANGE_DST 0x02 /* Match destination IP address */ +#define IPRANGE_SRC_INV 0x10 /* Negate the condition */ +#define IPRANGE_DST_INV 0x20 /* Negate the condition */ + +struct ipt_iprange { + /* Inclusive: network order. */ + u_int32_t min_ip, max_ip; +}; + +struct ipt_iprange_info +{ + struct ipt_iprange src; + struct ipt_iprange dst; + + /* Flags from above */ + u_int8_t flags; +}; + +#endif /* _IPT_IPRANGE_H */ diff -urNp linux-2.6.32.48/include/linux/netfilter_ipv4/ipt_owner.h linux-2.6.32.48-openvz/include/linux/netfilter_ipv4/ipt_owner.h --- linux-2.6.32.48/include/linux/netfilter_ipv4/ipt_owner.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter_ipv4/ipt_owner.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,20 @@ +#ifndef _IPT_OWNER_H +#define _IPT_OWNER_H + +/* match and invert flags */ +#define IPT_OWNER_UID 0x01 +#define IPT_OWNER_GID 0x02 +#define IPT_OWNER_PID 0x04 +#define IPT_OWNER_SID 0x08 +#define IPT_OWNER_COMM 0x10 + +struct ipt_owner_info { + uid_t uid; + gid_t gid; + pid_t pid; + pid_t sid; + char comm[16]; + u_int8_t match, invert; /* flags */ +}; + +#endif /*_IPT_OWNER_H*/ diff -urNp linux-2.6.32.48/include/linux/netfilter_ipv4/ipt_tos.h linux-2.6.32.48-openvz/include/linux/netfilter_ipv4/ipt_tos.h --- linux-2.6.32.48/include/linux/netfilter_ipv4/ipt_tos.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter_ipv4/ipt_tos.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,13 @@ +#ifndef 
_IPT_TOS_H +#define _IPT_TOS_H + +struct ipt_tos_info { + u_int8_t tos; + u_int8_t invert; +}; + +#ifndef IPTOS_NORMALSVC +#define IPTOS_NORMALSVC 0 +#endif + +#endif /*_IPT_TOS_H*/ diff -urNp linux-2.6.32.48/include/linux/netfilter_ipv4/ipt_TOS.h linux-2.6.32.48-openvz/include/linux/netfilter_ipv4/ipt_TOS.h --- linux-2.6.32.48/include/linux/netfilter_ipv4/ipt_TOS.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter_ipv4/ipt_TOS.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,12 @@ +#ifndef _IPT_TOS_H_target +#define _IPT_TOS_H_target + +#ifndef IPTOS_NORMALSVC +#define IPTOS_NORMALSVC 0 +#endif + +struct ipt_tos_target_info { + u_int8_t tos; +}; + +#endif /*_IPT_TOS_H_target*/ diff -urNp linux-2.6.32.48/include/linux/netfilter_ipv6/ip6t_owner.h linux-2.6.32.48-openvz/include/linux/netfilter_ipv6/ip6t_owner.h --- linux-2.6.32.48/include/linux/netfilter_ipv6/ip6t_owner.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/netfilter_ipv6/ip6t_owner.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,18 @@ +#ifndef _IP6T_OWNER_H +#define _IP6T_OWNER_H + +/* match and invert flags */ +#define IP6T_OWNER_UID 0x01 +#define IP6T_OWNER_GID 0x02 +#define IP6T_OWNER_PID 0x04 +#define IP6T_OWNER_SID 0x08 + +struct ip6t_owner_info { + uid_t uid; + gid_t gid; + pid_t pid; + pid_t sid; + u_int8_t match, invert; /* flags */ +}; + +#endif /*_IP6T_OWNER_H*/ diff -urNp linux-2.6.32.48/include/linux/nfs_fs.h linux-2.6.32.48-openvz/include/linux/nfs_fs.h --- linux-2.6.32.48/include/linux/nfs_fs.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/nfs_fs.h 2011-11-21 17:40:47.000000000 -0500 @@ -374,7 +374,7 @@ extern const struct address_space_operat static inline struct nfs_open_context *nfs_file_open_context(struct file *filp) { - return filp->private_data; + return file_private(filp); } static inline struct rpc_cred *nfs_file_cred(struct file *file) diff -urNp 
linux-2.6.32.48/include/linux/nfs_fs_sb.h linux-2.6.32.48-openvz/include/linux/nfs_fs_sb.h --- linux-2.6.32.48/include/linux/nfs_fs_sb.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/nfs_fs_sb.h 2011-11-21 17:40:47.000000000 -0500 @@ -91,6 +91,7 @@ struct nfs_client { #ifdef CONFIG_NFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif + struct ve_struct *owner_env; }; /* diff -urNp linux-2.6.32.48/include/linux/nmi.h linux-2.6.32.48-openvz/include/linux/nmi.h --- linux-2.6.32.48/include/linux/nmi.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/nmi.h 2011-11-21 17:40:47.000000000 -0500 @@ -47,4 +47,6 @@ static inline bool trigger_all_cpu_backt } #endif +extern void nmi_show_regs(struct pt_regs *regs, int in_nmi); +extern int do_nmi_show_regs(struct pt_regs *regs, int cpu); #endif diff -urNp linux-2.6.32.48/include/linux/notifier.h linux-2.6.32.48-openvz/include/linux/notifier.h --- linux-2.6.32.48/include/linux/notifier.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/notifier.h 2011-11-21 17:40:47.000000000 -0500 @@ -153,8 +153,9 @@ extern int __srcu_notifier_call_chain(st #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ +#define NOTIFY_FAIL 0x0002 /* Reject */ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ -#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) +#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL) /* Bad/Veto action */ /* * Clean way to return from the notifier and stop further calls. 
diff -urNp linux-2.6.32.48/include/linux/nsproxy.h linux-2.6.32.48-openvz/include/linux/nsproxy.h --- linux-2.6.32.48/include/linux/nsproxy.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/nsproxy.h 2011-11-21 17:40:47.000000000 -0500 @@ -62,10 +62,11 @@ static inline struct nsproxy *task_nspro return rcu_dereference(tsk->nsproxy); } -int copy_namespaces(unsigned long flags, struct task_struct *tsk); +int copy_namespaces(unsigned long flags, struct task_struct *tsk, int force_admin); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); +struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct fs_struct *); @@ -76,9 +77,10 @@ static inline void put_nsproxy(struct ns } } -static inline void get_nsproxy(struct nsproxy *ns) +static inline struct nsproxy *get_nsproxy(struct nsproxy *ns) { atomic_inc(&ns->count); + return ns; } #ifdef CONFIG_CGROUP_NS diff -urNp linux-2.6.32.48/include/linux/page-flags.h linux-2.6.32.48-openvz/include/linux/page-flags.h --- linux-2.6.32.48/include/linux/page-flags.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/page-flags.h 2011-11-21 17:40:47.000000000 -0500 @@ -209,6 +209,7 @@ __PAGEFLAG(Slab, slab) PAGEFLAG(Checked, checked) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ PAGEFLAG(SavePinned, savepinned); /* Xen */ +PAGEFLAG(Checkpointed, owner_priv_1) PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) diff -urNp linux-2.6.32.48/include/linux/pid.h linux-2.6.32.48-openvz/include/linux/pid.h --- linux-2.6.32.48/include/linux/pid.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/pid.h 2011-11-21 17:40:47.000000000 -0500 @@ -60,6 
+60,9 @@ struct pid unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; +#endif struct rcu_head rcu; struct upid numbers[1]; }; @@ -96,6 +99,11 @@ extern void change_pid(struct task_struc struct pid *pid); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); +extern void reattach_pid(struct task_struct *, enum pid_type, struct pid *); +extern int alloc_pidmap(struct pid_namespace *pid_ns); +extern int set_pidmap(struct pid_namespace *pid_ns, pid_t pid); + +extern spinlock_t pidmap_lock; struct pid_namespace; extern struct pid_namespace init_pid_ns; @@ -119,8 +127,11 @@ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); int next_pidmap(struct pid_namespace *pid_ns, unsigned int last); -extern struct pid *alloc_pid(struct pid_namespace *ns); +extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid); extern void free_pid(struct pid *pid); +extern int pid_ns_attach_init(struct pid_namespace *, struct task_struct *); +extern int pid_ns_attach_task(struct pid_namespace *, struct task_struct *); +pid_t pid_to_vpid(pid_t nr); /* * ns_of_pid() returns the pid namespace in which the specified pid was @@ -185,7 +196,7 @@ pid_t pid_vnr(struct pid *pid); do { #define while_each_pid_thread(pid, type, task) \ - } while_each_thread(tg___, task); \ + } while_each_thread_ve(tg___, task); \ task = tg___; \ } while_each_pid_task(pid, type, task) #endif /* _LINUX_PID_H */ diff -urNp linux-2.6.32.48/include/linux/pid_namespace.h linux-2.6.32.48-openvz/include/linux/pid_namespace.h --- linux-2.6.32.48/include/linux/pid_namespace.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/pid_namespace.h 2011-11-21 17:40:47.000000000 -0500 @@ -16,6 +16,14 @@ struct pidmap { struct bsd_acct_struct; +/* pid namespace flags */ + +/* if set newly created pid ns got 
PID_NS_HIDE_CHILD flag */ +#define PID_NS_HIDE_CHILD 0x00000001 + +/* if set newly created processes invisible from parent ns*/ +#define PID_NS_HIDDEN 0x00000002 + struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; @@ -24,6 +32,7 @@ struct pid_namespace { struct kmem_cache *pid_cachep; unsigned int level; struct pid_namespace *parent; + unsigned flags; #ifdef CONFIG_PROC_FS struct vfsmount *proc_mnt; #endif diff -urNp linux-2.6.32.48/include/linux/poll.h linux-2.6.32.48-openvz/include/linux/poll.h --- linux-2.6.32.48/include/linux/poll.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/poll.h 2011-11-21 17:40:47.000000000 -0500 @@ -133,6 +133,7 @@ extern int core_sys_select(int n, fd_set fd_set __user *exp, struct timespec *end_time); extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec); +long do_restart_poll(struct restart_block *restart_block); #endif /* KERNEL */ diff -urNp linux-2.6.32.48/include/linux/proc_fs.h linux-2.6.32.48-openvz/include/linux/proc_fs.h --- linux-2.6.32.48/include/linux/proc_fs.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/proc_fs.h 2011-11-21 17:40:47.000000000 -0500 @@ -103,9 +103,14 @@ struct vmcore { #ifdef CONFIG_PROC_FS extern void proc_root_init(void); +extern struct file_system_type proc_fs_type; +extern const struct file_operations proc_kmsg_operations; void proc_flush_task(struct task_struct *task); +extern int proc_dentry_of_dead_task(struct dentry *dentry); +extern struct file_operations dummy_proc_pid_file_operations; + extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent); struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, @@ -149,6 +154,8 @@ extern struct proc_dir_entry *proc_mkdir extern struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, struct proc_dir_entry *parent); +extern struct proc_dir_entry glob_proc_root; + 
static inline struct proc_dir_entry *proc_create(const char *name, mode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops) { @@ -184,6 +191,8 @@ extern void dup_mm_exe_file(struct mm_st #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) static inline void proc_net_remove(struct net *net, const char *name) {} +static inline int proc_dentry_of_dead_task(struct dentry *dentry) { return 0; } + static inline void proc_flush_task(struct task_struct *task) { } @@ -268,6 +277,9 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; +#ifdef CONFIG_VE + struct proc_dir_entry *lpde; +#endif struct inode vfs_inode; }; @@ -281,6 +293,15 @@ static inline struct proc_dir_entry *PDE return PROC_I(inode)->pde; } +static inline struct proc_dir_entry *LPDE(const struct inode *inode) +{ +#ifdef CONFIG_VE + return PROC_I(inode)->lpde; +#else + return NULL; +#endif +} + static inline struct net *PDE_NET(struct proc_dir_entry *pde) { return pde->parent->data; diff -urNp linux-2.6.32.48/include/linux/quota.h linux-2.6.32.48-openvz/include/linux/quota.h --- linux-2.6.32.48/include/linux/quota.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/quota.h 2011-11-21 17:40:47.000000000 -0500 @@ -173,6 +173,10 @@ enum { #include #include +#include + +extern spinlock_t dq_data_lock; + #include #include #include @@ -291,6 +295,8 @@ struct quota_format_ops { int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ }; +struct inode; +struct iattr; /* Operations working with dquots */ struct dquot_operations { int (*initialize) (struct inode *, int); @@ -316,9 +322,14 @@ struct dquot_operations { /* get reserved quota for delayed alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); + int (*rename) (struct inode *, struct inode *, struct inode *); + + void 
(*swap_inode) (struct inode *, struct inode *); + void (*shutdown) (struct super_block *); }; /* Operations handling requests from userspace */ +struct v2_disk_dqblk; struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, char *, int); int (*quota_off)(struct super_block *, int, int); @@ -331,6 +342,10 @@ struct quotactl_ops { int (*set_xstate)(struct super_block *, unsigned int, int); int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); +#ifdef CONFIG_QUOTA_COMPAT + int (*get_quoti)(struct super_block *, int, unsigned int, + struct v2_disk_dqblk __user *); +#endif }; struct quota_format_type { @@ -385,6 +400,10 @@ struct quota_info { struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + struct vz_quota_master *vzdq_master; + int vzdq_count; +#endif }; int register_quota_format(struct quota_format_type *fmt); diff -urNp linux-2.6.32.48/include/linux/quotaops.h linux-2.6.32.48-openvz/include/linux/quotaops.h --- linux-2.6.32.48/include/linux/quotaops.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/quotaops.h 2011-11-21 17:40:47.000000000 -0500 @@ -264,6 +264,19 @@ static inline void vfs_dq_free_inode(str inode->i_sb->dq_op->free_inode(inode, 1); } +static __inline__ int vfs_dq_rename(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + const struct dquot_operations *q_op; + + q_op = inode->i_sb->dq_op; + if (q_op && q_op->rename) { + if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA) + return 1; + } + return 0; +} + /* Cannot be called inside a transaction */ static inline int vfs_dq_off(struct super_block *sb, int remount) { @@ -274,6 +287,35 @@ static inline int vfs_dq_off(struct 
supe return ret; } +static __inline__ void DQUOT_SWAP(struct inode *inode, struct inode *tmpl) +{ + if (sb_any_quota_active(tmpl->i_sb) && + tmpl->i_sb->dq_op->swap_inode) + tmpl->i_sb->dq_op->swap_inode(inode, tmpl); +} + +static __inline__ int DQUOT_CHECK_SPACE(struct inode *inode) +{ + if (vfs_dq_alloc_space_nodirty(inode, 512)) + return -EDQUOT; + vfs_dq_free_space_nodirty(inode, 512); + return 0; +} + +static __inline__ void DQUOT_SYNC_BLOCKS(struct inode *inode, blkcnt_t blocks) +{ + if (sb_any_quota_active(inode->i_sb)) { + if (blocks > inode->i_blocks) + inode->i_sb->dq_op->alloc_space(inode, + (qsize_t)(blocks-inode->i_blocks)*512, + 13 /*DQUOT_CMD_FORCE*/); + else if (blocks < inode->i_blocks) + inode->i_sb->dq_op->free_space(inode, (qsize_t)(inode->i_blocks-blocks)*512); + } else + inode->i_blocks = blocks; +} + + #else static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) @@ -363,6 +405,12 @@ static inline int vfs_dq_transfer(struct return 0; } +static inline int vfs_dq_rename(struct inode *inode, struct inode *old_dir, + struct inode *new_dir) +{ + return 0; +} + static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) { inode_add_bytes(inode, nr); @@ -416,6 +464,15 @@ static inline void vfs_dq_free_space(str mark_inode_dirty(inode); } +static inline void DQUOT_SWAP(struct inode *inode, struct inode *tmpl) +{ +} + +static inline void DQUOT_SYNC_BLOCKS(struct inode *inode, blkcnt_t blocks) +{ + inode->i_blocks = blocks; +} + #endif /* CONFIG_QUOTA */ static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr) diff -urNp linux-2.6.32.48/include/linux/rmap.h linux-2.6.32.48-openvz/include/linux/rmap.h --- linux-2.6.32.48/include/linux/rmap.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/rmap.h 2011-11-21 17:40:47.000000000 -0500 @@ -70,6 +70,8 @@ void page_add_anon_rmap(struct page *, s void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, 
unsigned long); void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *); +struct anon_vma *page_lock_anon_vma(struct page *page); +void page_unlock_anon_vma(struct anon_vma *anon_vma); static inline void page_dup_rmap(struct page *page) { diff -urNp linux-2.6.32.48/include/linux/sched.h linux-2.6.32.48-openvz/include/linux/sched.h --- linux-2.6.32.48/include/linux/sched.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/sched.h 2011-11-21 17:40:47.000000000 -0500 @@ -94,6 +94,8 @@ struct sched_param { #include +#include + struct exec_domain; struct futex_pi_state; struct robust_list_head; @@ -120,6 +122,8 @@ struct perf_event_context; */ extern unsigned long avenrun[]; /* Load averages */ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); +extern void get_avenrun_ve(struct ve_struct *ve, unsigned long *loads, + unsigned long offset, int shift); #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<>= FSHIFT; +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + extern unsigned long total_forks; extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); +extern unsigned long nr_sleeping(void); +extern unsigned long nr_stopped(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(void); extern unsigned long this_cpu_load(void); +extern atomic_t nr_dead; +extern unsigned long nr_zombie; + +#ifdef CONFIG_VE +struct ve_struct; +extern unsigned long nr_running_ve(struct ve_struct *); +extern unsigned long nr_iowait_ve(struct ve_struct *); +extern unsigned long nr_uninterruptible_ve(struct ve_struct *); +extern cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu); +extern cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu); +void ve_sched_attach(struct ve_struct 
*envid); +#else +#define nr_running_ve(ve) 0 +#define nr_iowait_ve(ve) 0 +#define nr_uninterruptible_ve(ve) 0 +#define ve_sched_get_idle_time(ve, cpu) 0 +#define ve_sched_get_iowait_time(ve, cpu) 0 +#endif extern void calc_global_load(void); @@ -552,6 +579,9 @@ struct thread_group_cputimer { spinlock_t lock; }; +#include +#include + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -1282,6 +1312,7 @@ struct task_struct { unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ unsigned in_iowait:1; + unsigned did_ve_enter:1; /* Revert to default priority/policy when forking */ @@ -1497,6 +1528,14 @@ struct task_struct { struct rcu_head rcu; /* + * state tracking for suspend + * FIXME - ptrace is completely rewritten in this kernel + * so set_pn_state() is not set in many places correctly + */ + __u8 pn_state; + __u8 stopped_state:1; + + /* * cache last used pipe for splice */ struct pipe_inode_info *splice_pipe; @@ -1540,6 +1579,19 @@ struct task_struct { /* bitmask of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ +#ifdef CONFIG_BEANCOUNTERS + struct task_beancounter task_bc; +#endif +#ifdef CONFIG_VE + struct ve_task_info ve_task_info; +#endif +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + unsigned long magic; + struct inode *ino; +#endif +#ifdef CONFIG_VZ_FAIRSCHED + struct fairsched_node *fsched_node; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ @@ -1726,6 +1778,43 @@ extern cputime_t task_stime(struct task_ extern cputime_t task_gtime(struct task_struct *p); extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); +#ifndef CONFIG_VE +#define set_pn_state(tsk, state) do { } while(0) +#define clear_pn_state(tsk) do { } while(0) +#define set_stop_state(tsk) do { } while(0) +#define clear_stop_state(tsk) do { } while(0) +#else +#define PN_STOP_TF 1 /* was not in 2.6.8 */ +#define PN_STOP_TF_RT 2 /* was not in 2.6.8 */ +#define PN_STOP_ENTRY 3 +#define PN_STOP_FORK 4 +#define PN_STOP_VFORK 5 +#define PN_STOP_SIGNAL 6 +#define PN_STOP_EXIT 7 +#define PN_STOP_EXEC 8 +#define PN_STOP_LEAVE 9 + +static inline void set_pn_state(struct task_struct *tsk, int state) +{ + tsk->pn_state = state; +} + +static inline void clear_pn_state(struct task_struct *tsk) +{ + tsk->pn_state = 0; +} + +static inline void set_stop_state(struct task_struct *tsk) +{ + tsk->stopped_state = 1; +} + +static inline void clear_stop_state(struct task_struct *tsk) +{ + tsk->stopped_state = 0; +} +#endif + /* * Per process flags */ @@ -1734,6 +1823,7 @@ extern void thread_group_times(struct ta #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_EXIT_RESTART 0x00000020 /* do_exit() restarted, see do_exit() */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ @@ -1883,6 +1973,21 @@ extern unsigned long long task_sched_runtime(struct task_struct *task); extern unsigned long long thread_group_sched_runtime(struct task_struct *task); +static inline unsigned long cycles_to_clocks(cycles_t cycles) +{ + extern unsigned long cycles_per_clock; + do_div(cycles, cycles_per_clock); + return cycles; +} + +static inline u64 cycles_to_jiffies(cycles_t cycles) 
+{ + extern unsigned long cycles_per_jiffy; + do_div(cycles, cycles_per_jiffy); + return cycles; +} + + /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); @@ -2163,6 +2268,13 @@ extern int disallow_signal(int); extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern long do_fork_pid(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + long pid0); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); @@ -2180,11 +2292,11 @@ static inline unsigned long wait_task_in } #endif -#define next_task(p) \ +#define next_task_all(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) -#define for_each_process(p) \ - for (p = &init_task ; (p = next_task(p)) != &init_task ; ) +#define for_each_process_all(p) \ + for (p = &init_task ; (p = next_task_all(p)) != &init_task ; ) extern bool current_is_single_threaded(void); @@ -2192,10 +2304,10 @@ extern bool current_is_single_threaded(v * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. 
*/ -#define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do +#define do_each_thread_all(g, t) \ + for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do -#define while_each_thread(g, t) \ +#define while_each_thread_all(g, t) \ while ((t = next_thread(t)) != g) /* de_thread depends on thread_group_leader not being a pid based check */ @@ -2220,8 +2332,14 @@ int same_thread_group(struct task_struct static inline struct task_struct *next_thread(const struct task_struct *p) { - return list_entry_rcu(p->thread_group.next, + struct task_struct *tsk; + tsk = list_entry_rcu(p->thread_group.next, struct task_struct, thread_group); +#ifdef CONFIG_VE + /* all threads should belong to ONE ve! */ + BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env); +#endif + return tsk; } static inline int thread_group_empty(struct task_struct *p) @@ -2266,6 +2384,98 @@ static inline void unlock_task_sighand(s spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } +#ifndef CONFIG_VE + +#define for_each_process_ve(p) for_each_process_all(p) +#define do_each_thread_ve(g, t) do_each_thread_all(g, t) +#define while_each_thread_ve(g, t) while_each_thread_all(g, t) +#define first_task_ve() next_task_ve(&init_task) +#define __first_task_ve(owner) next_task_ve(&init_task) +#define __next_task_ve(owner, p) next_task_ve(p) +#define next_task_ve(p) \ + (next_task_all(p) != &init_task ? 
next_task_all(p) : NULL) + +#define ve_is_super(env) 1 +#define ve_accessible(target, owner) 1 +#define ve_accessible_strict(target, owner) 1 +#define ve_accessible_veid(target, owner) 1 +#define ve_accessible_strict_veid(target, owner) 1 + +#define VEID(ve) 0 + +#else /* CONFIG_VE */ + +#include + +#define ve_is_super(env) ((env) == get_ve0()) + +#define ve_accessible_strict(target, owner) ((target) == (owner)) +static inline int ve_accessible(struct ve_struct *target, + struct ve_struct *owner) +{ + return ve_is_super(owner) || ve_accessible_strict(target, owner); +} + +#define ve_accessible_strict_veid(target, owner) ((target) == (owner)) +static inline int ve_accessible_veid(envid_t target, envid_t owner) +{ + return get_ve0()->veid == owner || + ve_accessible_strict_veid(target, owner); +} + +#define VEID(ve) (ve->veid) + +static inline struct task_struct *ve_lh2task(struct ve_struct *ve, + struct list_head *lh) +{ + return lh == &ve->vetask_lh ? NULL : + list_entry(lh, struct task_struct, ve_task_info.vetask_list); +} + +static inline struct task_struct *__first_task_ve(struct ve_struct *ve) +{ + struct task_struct *tsk; + + if (unlikely(ve_is_super(ve))) { + tsk = next_task_all(&init_task); + if (tsk == &init_task) + tsk = NULL; + } else { + tsk = ve_lh2task(ve, rcu_dereference(ve->vetask_lh.next)); + } + return tsk; +} + +static inline struct task_struct *__next_task_ve(struct ve_struct *ve, + struct task_struct *tsk) +{ + if (unlikely(ve_is_super(ve))) { + tsk = next_task_all(tsk); + if (tsk == &init_task) + tsk = NULL; + } else { + BUG_ON(tsk->ve_task_info.owner_env != ve); + tsk = ve_lh2task(ve, rcu_dereference(tsk-> + ve_task_info.vetask_list.next)); + } + return tsk; +} + +#define first_task_ve() __first_task_ve(get_exec_env()) +#define next_task_ve(p) __next_task_ve(get_exec_env(), p) +/* no one uses prev_task_ve(), copy next_task_ve() if needed */ + +#define for_each_process_ve(p) \ + for (p = first_task_ve(); p != NULL ; p = next_task_ve(p)) + 
+#define do_each_thread_ve(g, t) \ + for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do + +#define while_each_thread_ve(g, t) \ + while ((t = next_thread(t)) != g) + +#endif /* CONFIG_VE */ + #ifndef __HAVE_THREAD_FUNCTIONS #define task_thread_info(task) ((struct thread_info *)(task)->stack) diff -urNp linux-2.6.32.48/include/linux/sem.h linux-2.6.32.48-openvz/include/linux/sem.h --- linux-2.6.32.48/include/linux/sem.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/sem.h 2011-11-21 17:40:47.000000000 -0500 @@ -154,6 +154,9 @@ static inline void exit_sem(struct task_ } #endif +int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg); +int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg); + #endif /* __KERNEL__ */ #endif /* _LINUX_SEM_H */ diff -urNp linux-2.6.32.48/include/linux/shmem_fs.h linux-2.6.32.48-openvz/include/linux/shmem_fs.h --- linux-2.6.32.48/include/linux/shmem_fs.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/shmem_fs.h 2011-11-21 17:40:47.000000000 -0500 @@ -18,6 +18,9 @@ struct shmem_inode_info { struct page *i_indirect; /* top indirect blocks page */ swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ struct list_head swaplist; /* chain of maybes on swap */ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *shmi_ub; +#endif struct inode vfs_inode; }; @@ -57,4 +60,7 @@ static inline int shmem_acl_init(struct } #endif /* CONFIG_TMPFS_POSIX_ACL */ +int shmem_insertpage(struct inode * inode, unsigned long index, + swp_entry_t swap); + #endif diff -urNp linux-2.6.32.48/include/linux/shm.h linux-2.6.32.48-openvz/include/linux/shm.h --- linux-2.6.32.48/include/linux/shm.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/shm.h 2011-11-21 17:40:47.000000000 -0500 @@ -83,6 +83,22 @@ struct shm_info { }; #ifdef __KERNEL__ + +#include + +#define IPC_SEM_IDS 0 +#define IPC_MSG_IDS 1 +#define IPC_SHM_IDS 2 + 
+struct shm_file_data { + int id; + struct ipc_namespace *ns; + struct file *file; + const struct vm_operations_struct *vm_ops; +}; +#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) +#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) + struct shmid_kernel /* private to the kernel */ { struct kern_ipc_perm shm_perm; @@ -97,6 +113,23 @@ struct shmid_kernel /* private to the ke struct user_struct *mlock_user; }; +/* + * shm_lock_(check_) routines are called in the paths where the rw_mutex + * is not held. + */ +static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return (struct shmid_kernel *)ipcp; + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +#define shm_unlock(shp) \ + ipc_unlock(&(shp)->shm_perm) + /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ #define SHM_LOCKED 02000 /* segment will not be swapped */ @@ -118,6 +151,12 @@ static inline int is_file_shm_hugepages( } #endif +int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg); +struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg); +extern const struct file_operations shmem_file_operations; +extern const struct file_operations shm_file_operations; + +extern struct file_system_type tmpfs_fs_type; #endif /* __KERNEL__ */ #endif /* _LINUX_SHM_H_ */ diff -urNp linux-2.6.32.48/include/linux/signalfd.h linux-2.6.32.48-openvz/include/linux/signalfd.h --- linux-2.6.32.48/include/linux/signalfd.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/signalfd.h 2011-11-21 17:40:47.000000000 -0500 @@ -60,6 +60,12 @@ static inline void signalfd_notify(struc wake_up(&tsk->sighand->signalfd_wqh); } +struct signalfd_ctx { + sigset_t sigmask; +}; + +extern long do_signalfd(int ufd, sigset_t *sigmask, int flags); + #else /* CONFIG_SIGNALFD */ static 
inline void signalfd_notify(struct task_struct *tsk, int sig) { } diff -urNp linux-2.6.32.48/include/linux/signal.h linux-2.6.32.48-openvz/include/linux/signal.h --- linux-2.6.32.48/include/linux/signal.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/signal.h 2011-11-21 17:40:47.000000000 -0500 @@ -6,6 +6,8 @@ #ifdef __KERNEL__ #include +#include +#include /* * Real Time signals may be queued. @@ -16,6 +18,9 @@ struct sigqueue { int flags; siginfo_t info; struct user_struct *user; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *sig_ub; +#endif }; /* flags values. */ @@ -376,6 +381,8 @@ int unhandled_signal(struct task_struct void signals_init(void); +extern struct kmem_cache *sigqueue_cachep; + #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNAL_H */ diff -urNp linux-2.6.32.48/include/linux/skbuff.h linux-2.6.32.48-openvz/include/linux/skbuff.h --- linux-2.6.32.48/include/linux/skbuff.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/skbuff.h 2011-11-21 17:40:47.000000000 -0500 @@ -310,6 +310,8 @@ typedef unsigned char *sk_buff_data_t; * @vlan_tci: vlan tag control information */ +#include + struct sk_buff { /* These two members must be first. 
*/ struct sk_buff *next; @@ -357,6 +359,13 @@ struct sk_buff { __be16 protocol:16; kmemcheck_bitfield_end(flags1); +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) + __u8 brmark; +#endif +#ifdef CONFIG_VE + unsigned int accounted:1; + unsigned int redirected:1; +#endif void (*destructor)(struct sk_buff *skb); #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; @@ -404,6 +413,8 @@ struct sk_buff { *data; unsigned int truesize; atomic_t users; + struct skb_beancounter skb_bc; + struct ve_struct *owner_env; }; #ifdef __KERNEL__ @@ -411,6 +422,7 @@ struct sk_buff { * Handling routines are only of interest to the kernel */ #include +#include #include @@ -1422,6 +1434,9 @@ static inline void pskb_trim_unique(stru */ static inline void skb_orphan(struct sk_buff *skb) { + if (skb->sk) + ub_skb_uncharge(skb); + if (skb->destructor) skb->destructor(skb); skb->destructor = NULL; @@ -2008,6 +2023,26 @@ static inline void skb_init_secmark(stru { } #endif +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from) +{ + to->brmark = from->brmark; +} + +static inline void skb_init_brmark(struct sk_buff *skb) +{ + skb->brmark = 0; +} +#else +static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from) +{ +} + +static inline void skb_init_brmark(struct sk_buff *skb) +{ +} +#endif + static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) { skb->queue_mapping = queue_mapping; diff -urNp linux-2.6.32.48/include/linux/slab_def.h linux-2.6.32.48-openvz/include/linux/slab_def.h --- linux-2.6.32.48/include/linux/slab_def.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/slab_def.h 2011-11-21 17:40:47.000000000 -0500 @@ -17,6 +17,26 @@ #include /* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 
+ * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define SLAB_DEBUG 1 +#define SLAB_STATS 1 +#define SLAB_FORCED_DEBUG 1 +#else +#define SLAB_DEBUG 0 +#define SLAB_STATS 0 +#define SLAB_FORCED_DEBUG 0 +#endif + +/* * struct kmem_cache * * manages a cache. @@ -64,6 +84,7 @@ struct kmem_cache { unsigned long high_mark; unsigned long grown; unsigned long reaped; + unsigned long shrunk; unsigned long errors; unsigned long max_freeable; unsigned long node_allocs; @@ -83,6 +104,9 @@ struct kmem_cache { int obj_offset; int obj_size; #endif /* CONFIG_DEBUG_SLAB */ +#ifdef CONFIG_BEANCOUNTERS + int objuse; +#endif /* * We put nodelists[] at the end of kmem_cache, because we want to size @@ -106,6 +130,7 @@ struct cache_sizes { #endif }; extern struct cache_sizes malloc_sizes[]; +extern int malloc_cache_num; void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); @@ -145,6 +170,8 @@ static __always_inline void *kmalloc(siz #undef CACHE return NULL; found: + if (flags & __GFP_UBC) + i += malloc_cache_num; #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) cachep = malloc_sizes[i].cs_dmacachep; diff -urNp linux-2.6.32.48/include/linux/slab.h linux-2.6.32.48-openvz/include/linux/slab.h --- linux-2.6.32.48/include/linux/slab.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/slab.h 2011-11-21 17:40:47.000000000 -0500 @@ -88,6 +88,26 @@ (unsigned long)ZERO_SIZE_PTR) /* + * allocation rules: __GFP_UBC 0 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * cache (SLAB_UBC) charge charge + * (usual caches: mm, vma, task_struct, ...) 
+ * + * cache (SLAB_UBC | SLAB_NO_CHARGE) charge --- + * (ub_kmalloc) (kmalloc) + * + * cache (no UB flags) BUG() --- + * (nonub caches, mempools) + * + * pages charge --- + * (ub_vmalloc, (vmalloc, + * poll, fdsets, ...) non-ub allocs) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +#define SLAB_UBC 0x10000000UL /* alloc space for ubs ... */ +#define SLAB_NO_CHARGE 0x20000000UL /* ... but don't charge */ + +/* * struct kmem_cache related prototypes */ void __init kmem_cache_init(void); @@ -102,7 +122,24 @@ void kmem_cache_free(struct kmem_cache * unsigned int kmem_cache_size(struct kmem_cache *); const char *kmem_cache_name(struct kmem_cache *); int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); - +#ifdef CONFIG_SLABINFO +extern void show_slab_info(void); +#else +#define show_slab_info() do { } while (0) +#endif +int kmem_cache_objuse(struct kmem_cache *cachep); +int kmem_obj_objuse(void *obj); +int kmem_dname_objuse(void *obj); +unsigned long ub_cache_growth(struct kmem_cache *cachep); + +#ifdef CONFIG_BEANCOUNTERS +void kmem_mark_nocharge(struct kmem_cache *cachep); +struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj); +struct user_beancounter *slab_ub(void *obj); +#else +static inline void kmem_mark_nocharge(struct kmem_cache *cachep) { } +static inline struct user_beancounter *slab_ub(void *obj) { return NULL; } +#endif /* * Please use this macro to create slab caches. Simply specify the * name of the structure and maybe some flags that are listed above. 
diff -urNp linux-2.6.32.48/include/linux/slub_def.h linux-2.6.32.48-openvz/include/linux/slub_def.h --- linux-2.6.32.48/include/linux/slub_def.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/slub_def.h 2011-11-21 17:40:47.000000000 -0500 @@ -97,6 +97,10 @@ struct kmem_cache { struct kobject kobj; /* For sysfs */ #endif +#ifdef CONFIG_BEANCOUNTERS + atomic_t grown; + int objuse; +#endif #ifdef CONFIG_NUMA /* * Defragmentation by allocating from a remote node. @@ -141,6 +145,19 @@ struct kmem_cache { */ extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; +#ifdef CONFIG_BEANCOUNTERS +extern struct kmem_cache ub_kmalloc_caches[SLUB_PAGE_SHIFT]; +static inline struct kmem_cache *__kmalloc_cache(gfp_t f, int idx) +{ + return (f & __GFP_UBC) ? &ub_kmalloc_caches[idx] : &kmalloc_caches[idx]; +} +#else +static inline struct kmem_cache *__kmalloc_cache(gfp_t flags, int idx) +{ + return &kmalloc_caches[idx]; +} +#endif + /* * Sorry that the following has to be that ugly but some versions of GCC * have trouble with constant propagation and loops. @@ -197,14 +214,14 @@ static __always_inline int kmalloc_index * This ought to end up with a global pointer to the right cache * in kmalloc_caches. 
*/ -static __always_inline struct kmem_cache *kmalloc_slab(size_t size) +static __always_inline struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { int index = kmalloc_index(size); if (index == 0) return NULL; - return &kmalloc_caches[index]; + return __kmalloc_cache(flags, index); } #ifdef CONFIG_ZONE_DMA @@ -247,7 +264,7 @@ static __always_inline void *kmalloc(siz return kmalloc_large(size, flags); if (!(flags & SLUB_DMA)) { - struct kmem_cache *s = kmalloc_slab(size); + struct kmem_cache *s = kmalloc_slab(size, flags); if (!s) return ZERO_SIZE_PTR; @@ -286,7 +303,7 @@ static __always_inline void *kmalloc_nod if (__builtin_constant_p(size) && size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { - struct kmem_cache *s = kmalloc_slab(size); + struct kmem_cache *s = kmalloc_slab(size, flags); if (!s) return ZERO_SIZE_PTR; diff -urNp linux-2.6.32.48/include/linux/socket.h linux-2.6.32.48-openvz/include/linux/socket.h --- linux-2.6.32.48/include/linux/socket.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/socket.h 2011-11-21 17:40:47.000000000 -0500 @@ -296,6 +296,16 @@ struct ucred { #define IPX_TYPE 1 #ifdef __KERNEL__ + +#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - + 16 for IP, 16 for IPX, + 24 for IPv6, + about 80 for AX.25 + must be at least one bigger than + the AF_UNIX size (see net/unix/af_unix.c + :unix_mkname()). 
+ */ + extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, int offset, int len); @@ -311,6 +321,8 @@ extern int memcpy_toiovecend(const struc extern int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, int __user *ulen); extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int vz_security_family_check(int family); +extern int vz_security_protocol_check(int protocol); #endif #endif /* not kernel and not glibc */ diff -urNp linux-2.6.32.48/include/linux/sunrpc/clnt.h linux-2.6.32.48-openvz/include/linux/sunrpc/clnt.h --- linux-2.6.32.48/include/linux/sunrpc/clnt.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/sunrpc/clnt.h 2011-11-21 17:40:47.000000000 -0500 @@ -50,6 +50,7 @@ struct rpc_clnt { cl_discrtry : 1,/* disconnect before retry */ cl_autobind : 1,/* use getport() */ cl_chatty : 1;/* be verbose */ + unsigned int cl_broken : 1;/* no response for too long */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ const struct rpc_timeout *cl_timeout; /* Timeout strategy */ @@ -61,6 +62,7 @@ struct rpc_clnt { struct rpc_rtt cl_rtt_default; struct rpc_timeout cl_timeout_default; struct rpc_program * cl_program; + unsigned long cl_pr_time; char cl_inline_name[32]; char *cl_principal; /* target to authenticate to */ }; diff -urNp linux-2.6.32.48/include/linux/sunrpc/sched.h linux-2.6.32.48-openvz/include/linux/sunrpc/sched.h --- linux-2.6.32.48/include/linux/sunrpc/sched.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/sunrpc/sched.h 2011-11-21 17:40:47.000000000 -0500 @@ -216,6 +216,7 @@ void rpc_put_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_release_calldata(const struct rpc_call_ops *, void *); void rpc_killall_tasks(struct rpc_clnt *); 
+void rpc_kill_client(struct rpc_clnt *); void rpc_execute(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); @@ -239,6 +240,7 @@ void rpc_show_tasks(void); int rpc_init_mempool(void); void rpc_destroy_mempool(void); extern struct workqueue_struct *rpciod_workqueue; +extern struct rw_semaphore rpc_async_task_lock; void rpc_prepare_task(struct rpc_task *task); static inline void rpc_exit(struct rpc_task *task, int status) diff -urNp linux-2.6.32.48/include/linux/sunrpc/xprt.h linux-2.6.32.48-openvz/include/linux/sunrpc/xprt.h --- linux-2.6.32.48/include/linux/sunrpc/xprt.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/sunrpc/xprt.h 2011-11-21 17:40:47.000000000 -0500 @@ -24,6 +24,14 @@ #define RPC_MAX_SLOT_TABLE (128U) /* + * Grand abort timeout (stop the client if it occurs) + */ +extern int xprt_abort_timeout; + +#define RPC_MIN_ABORT_TIMEOUT 300 +#define RPC_MAX_ABORT_TIMEOUT INT_MAX + +/* * This describes a timeout strategy */ struct rpc_timeout { @@ -144,6 +152,7 @@ enum xprt_transports { struct rpc_xprt { struct kref kref; /* Reference count */ struct rpc_xprt_ops * ops; /* transport methods */ + struct ve_struct * owner_env; /* VE owner of mount */ const struct rpc_timeout *timeout; /* timeout parms */ struct sockaddr_storage addr; /* server address */ diff -urNp linux-2.6.32.48/include/linux/swap.h linux-2.6.32.48-openvz/include/linux/swap.h --- linux-2.6.32.48/include/linux/swap.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/swap.h 2011-11-21 17:40:47.000000000 -0500 @@ -19,6 +19,7 @@ struct bio; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 +#define SWAP_FLAG_READONLY 0x40000000 /* set if swap is read-only */ static inline int current_is_kswapd(void) { @@ -116,6 +117,7 @@ struct address_space; struct 
sysinfo; struct writeback_control; struct zone; +struct user_beancounter; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of @@ -146,6 +148,7 @@ enum { SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ /* add others here before... */ + SWP_READONLY = (1 << 5), SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; @@ -158,6 +161,7 @@ enum { /* * The in-memory structure used to track swap areas. */ +struct user_beancounter; struct swap_info_struct { unsigned long flags; int prio; /* swap priority */ @@ -177,6 +181,9 @@ struct swap_info_struct { unsigned int max; unsigned int inuse_pages; unsigned int old_block_size; +#ifdef CONFIG_BC_SWAP_ACCOUNTING + struct user_beancounter **swap_ubs; +#endif }; struct swap_list_t { @@ -184,9 +191,21 @@ struct swap_list_t { int next; /* swapfile to be used next */ }; +extern struct swap_list_t swap_list; +extern struct swap_info_struct swap_info[MAX_SWAPFILES]; + /* Swap 50% full? Release swapcache more aggressively.. 
*/ #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) +/* linux/mm/oom_kill.c */ +extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); +extern int register_oom_notifier(struct notifier_block *nb); +extern int unregister_oom_notifier(struct notifier_block *nb); +extern int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + struct mem_cgroup *mem, const char *message); +extern struct task_struct *select_bad_process(struct user_beancounter *ub, + struct mem_cgroup *memcg); + /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; @@ -284,6 +303,7 @@ extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern void __delete_from_swap_cache(struct page *); +extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); @@ -297,7 +317,7 @@ extern struct page *swapin_readahead(swp extern long nr_swap_pages; extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page(struct user_beancounter *); extern swp_entry_t get_swap_page_of_type(int); extern void swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); @@ -312,6 +332,7 @@ extern sector_t swapdev_block(int, pgoff extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); +extern int swap_readonly(struct page *); struct backing_dev_info; /* linux/mm/thrash.c */ @@ -428,7 +449,7 @@ static inline int try_to_free_swap(struc return 0; } -static inline swp_entry_t get_swap_page(void) +static inline swp_entry_t get_swap_page(struct user_beancounter *ub) { swp_entry_t entry; 
entry.val = 0; diff -urNp linux-2.6.32.48/include/linux/sysctl.h linux-2.6.32.48-openvz/include/linux/sysctl.h --- linux-2.6.32.48/include/linux/sysctl.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/sysctl.h 2011-11-21 17:40:47.000000000 -0500 @@ -1103,10 +1103,15 @@ struct ctl_table_header *__register_sysc struct ctl_table_header *register_sysctl_table(struct ctl_table * table); struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table); +struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *, int); +struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *, + struct ctl_table *, int); void unregister_sysctl_table(struct ctl_table_header * table); int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); +extern int ve_allow_kthreads; + #endif /* __KERNEL__ */ #endif /* _LINUX_SYSCTL_H */ diff -urNp linux-2.6.32.48/include/linux/sysfs.h linux-2.6.32.48-openvz/include/linux/sysfs.h --- linux-2.6.32.48/include/linux/sysfs.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/sysfs.h 2011-11-21 17:40:47.000000000 -0500 @@ -17,8 +17,23 @@ #include #include +#ifdef CONFIG_SYSFS_DEPRECATED_DYN +extern unsigned sysfs_deprecated; +#else + +/* static deprecation */ + +#ifdef CONFIG_SYSFS_DEPRECATED +#define sysfs_deprecated 1 +#else +#define sysfs_deprecated 0 +#endif + +#endif + struct kobject; struct module; +struct sysfs_open_dirent; /* FIXME * The *owner field is no longer used. @@ -38,7 +53,7 @@ struct attribute_group { struct attribute **attrs; }; - +#include /** * Use these macros to make defining attributes easier. 
See include/linux/device.h @@ -81,6 +96,73 @@ struct sysfs_ops { struct sysfs_dirent; +/* type-specific structures for sysfs_dirent->s_* union members */ +struct sysfs_elem_dir { + struct kobject *kobj; + /* children list starts here and goes through sd->s_sibling */ + struct sysfs_dirent *children; +}; + +struct sysfs_elem_symlink { + struct sysfs_dirent *target_sd; +}; + +struct sysfs_elem_attr { + struct attribute *attr; + struct sysfs_open_dirent *open; +}; + +struct sysfs_elem_bin_attr { + struct bin_attribute *bin_attr; + struct hlist_head buffers; +}; + +struct sysfs_inode_attrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; +}; + +/* + * sysfs_dirent - the building block of sysfs hierarchy. Each and + * every sysfs node is represented by single sysfs_dirent. + * + * As long as s_count reference is held, the sysfs_dirent itself is + * accessible. Dereferencing s_elem or any other outer entity + * requires s_active reference. + */ +struct sysfs_dirent { + atomic_t s_count; + atomic_t s_active; + struct sysfs_dirent *s_parent; + struct sysfs_dirent *s_sibling; + const char *s_name; + + union { + struct sysfs_elem_dir s_dir; + struct sysfs_elem_symlink s_symlink; + struct sysfs_elem_attr s_attr; + struct sysfs_elem_bin_attr s_bin_attr; + }; + + unsigned int s_flags; + ino_t s_ino; + umode_t s_mode; + struct sysfs_inode_attrs *s_iattr; +}; + +#define SD_DEACTIVATED_BIAS INT_MIN + +#define SYSFS_TYPE_MASK 0x00ff +#define SYSFS_DIR 0x0001 +#define SYSFS_KOBJ_ATTR 0x0002 +#define SYSFS_KOBJ_BIN_ATTR 0x0004 +#define SYSFS_KOBJ_LINK 0x0008 +#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) + +#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK +#define SYSFS_FLAG_REMOVED 0x0200 + #ifdef CONFIG_SYSFS int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), @@ -129,6 +211,8 @@ void sysfs_put(struct sysfs_dirent *sd); void sysfs_printk_last_file(void); int __must_check sysfs_init(void); +extern struct file_system_type sysfs_fs_type; + #else 
/* CONFIG_SYSFS */ static inline int sysfs_schedule_callback(struct kobject *kobj, diff -urNp linux-2.6.32.48/include/linux/task_io_accounting_ops.h linux-2.6.32.48-openvz/include/linux/task_io_accounting_ops.h --- linux-2.6.32.48/include/linux/task_io_accounting_ops.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/task_io_accounting_ops.h 2011-11-21 17:40:47.000000000 -0500 @@ -5,10 +5,12 @@ #define __TASK_IO_ACCOUNTING_OPS_INCLUDED #include +#include #ifdef CONFIG_TASK_IO_ACCOUNTING static inline void task_io_account_read(size_t bytes) { + ub_io_account_read(bytes); current->ioac.read_bytes += bytes; } @@ -21,8 +23,14 @@ static inline unsigned long task_io_get_ return p->ioac.read_bytes >> 9; } -static inline void task_io_account_write(size_t bytes) +static inline void task_io_account_write(struct page *page, size_t bytes, + int sync) { + if (sync) + ub_io_account_write(bytes); + else + ub_io_account_dirty(page, bytes); + current->ioac.write_bytes += bytes; } @@ -37,6 +45,7 @@ static inline unsigned long task_io_get_ static inline void task_io_account_cancelled_write(size_t bytes) { + ub_io_account_write_cancelled(bytes); current->ioac.cancelled_write_bytes += bytes; } @@ -64,7 +73,8 @@ static inline unsigned long task_io_get_ return 0; } -static inline void task_io_account_write(size_t bytes) +static inline void task_io_account_write(struct page *page, size_t bytes, + int sync) { } diff -urNp linux-2.6.32.48/include/linux/tcp.h linux-2.6.32.48-openvz/include/linux/tcp.h --- linux-2.6.32.48/include/linux/tcp.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/tcp.h 2011-11-21 17:40:47.000000000 -0500 @@ -413,6 +413,11 @@ static inline struct tcp_sock *tcp_sk(co return (struct tcp_sock *)sk; } +static inline int tcp_urg_mode(const struct tcp_sock *tp) +{ + return tp->snd_una != tp->snd_up; +} + struct tcp_timewait_sock { struct inet_timewait_sock tw_sk; u32 tw_rcv_nxt; diff -urNp 
linux-2.6.32.48/include/linux/tty_driver.h linux-2.6.32.48-openvz/include/linux/tty_driver.h --- linux-2.6.32.48/include/linux/tty_driver.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/tty_driver.h 2011-11-21 17:40:47.000000000 -0500 @@ -309,8 +309,19 @@ struct tty_driver { const struct tty_operations *ops; struct list_head tty_drivers; + struct ve_struct *owner_env; }; +#ifdef CONFIG_UNIX98_PTYS +extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ +extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */ +#endif + +#ifdef CONFIG_LEGACY_PTYS +extern struct tty_driver *pty_driver; +extern struct tty_driver *pty_slave_driver; +#endif + extern struct list_head tty_drivers; extern struct tty_driver *alloc_tty_driver(int lines); @@ -319,6 +330,9 @@ extern void tty_set_operations(struct tt const struct tty_operations *op); extern struct tty_driver *tty_find_polling_driver(char *name, int *line); +int init_ve_tty_class(void); +void fini_ve_tty_class(void); + extern void tty_driver_kref_put(struct tty_driver *driver); static inline struct tty_driver *tty_driver_kref_get(struct tty_driver *d) diff -urNp linux-2.6.32.48/include/linux/tty.h linux-2.6.32.48-openvz/include/linux/tty.h --- linux-2.6.32.48/include/linux/tty.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/tty.h 2011-11-21 17:40:47.000000000 -0500 @@ -313,6 +313,7 @@ struct tty_struct { /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; + struct ve_struct *owner_env; }; /* tty magic number */ @@ -344,6 +345,7 @@ struct tty_struct { #define TTY_HUPPED 18 /* Post driver->hangup() */ #define TTY_FLUSHING 19 /* Flushing to ldisc in progress */ #define TTY_FLUSHPENDING 20 /* Queued buffer flush pending */ +#define TTY_CHARGED 21 /* Charged as ub resource */ #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) @@ -449,7 +451,7 @@ extern void 
free_tty_struct(struct tty_s extern void initialize_tty_struct(struct tty_struct *tty, struct tty_driver *driver, int idx); extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx, - int first_ok); + struct tty_struct *i_tty, int first_ok); extern void tty_release_dev(struct file *filp); extern int tty_init_termios(struct tty_struct *tty); diff -urNp linux-2.6.32.48/include/linux/types.h linux-2.6.32.48-openvz/include/linux/types.h --- linux-2.6.32.48/include/linux/types.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/types.h 2011-11-21 17:40:47.000000000 -0500 @@ -31,6 +31,11 @@ typedef __kernel_timer_t timer_t; typedef __kernel_clockid_t clockid_t; typedef __kernel_mqd_t mqd_t; +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + typedef _Bool bool; typedef __kernel_uid32_t uid_t; diff -urNp linux-2.6.32.48/include/linux/utsname.h linux-2.6.32.48-openvz/include/linux/utsname.h --- linux-2.6.32.48/include/linux/utsname.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/utsname.h 2011-11-21 17:40:47.000000000 -0500 @@ -42,6 +42,7 @@ struct uts_namespace { struct new_utsname name; }; extern struct uts_namespace init_uts_ns; +extern struct new_utsname virt_utsname; #ifdef CONFIG_UTS_NS static inline void get_uts_ns(struct uts_namespace *ns) diff -urNp linux-2.6.32.48/include/linux/ve.h linux-2.6.32.48-openvz/include/linux/ve.h --- linux-2.6.32.48/include/linux/ve.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/ve.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,365 @@ +/* + * include/linux/ve.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_VE_H +#define _LINUX_VE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef VZMON_DEBUG +# define VZTRACE(fmt,args...) 
\ + printk(KERN_DEBUG fmt, ##args) +#else +# define VZTRACE(fmt,args...) +#endif /* VZMON_DEBUG */ + +struct tty_driver; +struct task_struct; +struct new_utsname; +struct file_system_type; +struct icmp_mib; +struct ip_mib; +struct tcp_mib; +struct udp_mib; +struct linux_mib; +struct fib_info; +struct fib_rule; +struct veip_struct; +struct ve_monitor; +struct nsproxy; + +#if defined(CONFIG_VE) && defined(CONFIG_INET) +struct fib_table; +#ifdef CONFIG_VE_IPTABLES +struct xt_table; +struct nf_conn; + +#define FRAG6Q_HASHSZ 64 + +struct ve_nf_conntrack { + struct hlist_head *_bysource; + struct nf_nat_protocol **_nf_nat_protos; + int _nf_nat_vmalloced; + struct xt_table *_nf_nat_table; + struct nf_conntrack_l3proto *_nf_nat_l3proto; + atomic_t _nf_conntrack_count; + int _nf_conntrack_max; + struct hlist_head *_nf_conntrack_hash; + int _nf_conntrack_checksum; + int _nf_conntrack_vmalloc; + struct hlist_head _unconfirmed; + struct hlist_head *_nf_ct_expect_hash; + unsigned int _nf_ct_expect_vmalloc; + unsigned int _nf_ct_expect_count; + unsigned int _nf_ct_expect_max; + struct hlist_head *_nf_ct_helper_hash; + unsigned int _nf_ct_helper_vmalloc; +#ifdef CONFIG_SYSCTL + /* l4 stuff: */ + unsigned long _nf_ct_icmp_timeout; + unsigned long _nf_ct_icmpv6_timeout; + unsigned int _nf_ct_udp_timeout; + unsigned int _nf_ct_udp_timeout_stream; + unsigned int _nf_ct_generic_timeout; + unsigned int _nf_ct_log_invalid; + unsigned int _nf_ct_tcp_timeout_max_retrans; + unsigned int _nf_ct_tcp_timeout_unacknowledged; + int _nf_ct_tcp_be_liberal; + int _nf_ct_tcp_loose; + int _nf_ct_tcp_max_retrans; + unsigned int _nf_ct_tcp_timeouts[10]; + struct ctl_table_header *_icmp_sysctl_header; + unsigned int _tcp_sysctl_table_users; + struct ctl_table_header *_tcp_sysctl_header; + unsigned int _udp_sysctl_table_users; + struct ctl_table_header *_udp_sysctl_header; + struct ctl_table_header *_icmpv6_sysctl_header; + struct ctl_table_header *_generic_sysctl_header; +#ifdef 
CONFIG_NF_CONNTRACK_PROC_COMPAT + struct ctl_table_header *_icmp_compat_sysctl_header; + struct ctl_table_header *_tcp_compat_sysctl_header; + struct ctl_table_header *_udp_compat_sysctl_header; + struct ctl_table_header *_generic_compat_sysctl_header; +#endif + /* l4 protocols sysctl tables: */ + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmp; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp4; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmpv6; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp6; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp4; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp6; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_generic; + struct nf_conntrack_l4proto **_nf_ct_protos[PF_MAX]; + /* l3 protocols sysctl tables: */ + struct nf_conntrack_l3proto *_nf_conntrack_l3proto_ipv4; + struct nf_conntrack_l3proto *_nf_conntrack_l3proto_ipv6; + struct nf_conntrack_l3proto *_nf_ct_l3protos[AF_MAX]; + /* sysctl standalone stuff: */ + struct ctl_table_header *_nf_ct_sysctl_header; + ctl_table *_nf_ct_sysctl_table; + ctl_table *_nf_ct_netfilter_table; + ctl_table *_nf_ct_net_table; + ctl_table *_ip_ct_netfilter_table; + struct ctl_table_header *_ip_ct_sysctl_header; + int _nf_ct_log_invalid_proto_min; + int _nf_ct_log_invalid_proto_max; +#endif /* CONFIG_SYSCTL */ +}; +#endif +#endif + +struct ve_cpu_stats { + cycles_t idle_time; + cycles_t iowait_time; + cycles_t strt_idle_time; + cycles_t used_time; + seqcount_t stat_lock; + unsigned long nr_running; + unsigned long nr_unint; + unsigned long nr_iowait; + cputime64_t user; + cputime64_t nice; + cputime64_t system; +} ____cacheline_aligned; + +struct ve_ipt_recent; +struct ve_xt_hashlimit; +struct svc_rqst; + +struct cgroup; +struct css_set; + +struct ve_struct { + struct list_head ve_list; + + envid_t veid; + struct list_head vetask_lh; + /* capability bounding set */ + kernel_cap_t ve_cap_bset; + atomic_t pcounter; + /* ref counter to ve from ipc */ 
+ atomic_t counter; + unsigned int class_id; + struct rw_semaphore op_sem; + int is_running; + int is_locked; + atomic_t suspend; + unsigned long flags; + /* see vzcalluser.h for VE_FEATURE_XXX definitions */ + __u64 features; + +/* VE's root */ + struct path root_path; + + struct file_system_type *proc_fstype; + struct vfsmount *proc_mnt; + struct proc_dir_entry *proc_root; + +/* BSD pty's */ +#ifdef CONFIG_LEGACY_PTYS + struct tty_driver *pty_driver; + struct tty_driver *pty_slave_driver; +#endif +#ifdef CONFIG_UNIX98_PTYS + struct tty_driver *ptm_driver; + struct tty_driver *pts_driver; + struct ida *allocated_ptys; + struct file_system_type *devpts_fstype; + struct vfsmount *devpts_mnt; + struct dentry *devpts_root; + struct devpts_config *devpts_config; +#endif + + struct ve_nfs_context *nfs_context; + + struct file_system_type *shmem_fstype; + struct vfsmount *shmem_mnt; +#ifdef CONFIG_SYSFS + struct file_system_type *sysfs_fstype; + struct vfsmount *sysfs_mnt; + struct super_block *sysfs_sb; + struct sysfs_dirent *_sysfs_root; +#endif + struct kobject *_virtual_dir; + struct kset *class_kset; + struct kset *devices_kset; + struct kobject *dev_kobj; + struct kobject *dev_char_kobj; + struct kobject *dev_block_kobj; + struct class *tty_class; + struct class *mem_class; + +#ifdef CONFIG_NET + struct class *net_class; +#ifdef CONFIG_INET + unsigned long rt_flush_required; +#endif +#endif +#if defined(CONFIG_VE_NETDEV) || defined (CONFIG_VE_NETDEV_MODULE) + struct veip_struct *veip; + struct net_device *_venet_dev; +#endif + +/* per VE CPU stats*/ + struct timespec start_timespec; + u64 start_jiffies; /* Deprecated */ + cycles_t start_cycles; + unsigned long avenrun[3]; /* loadavg data */ + + cycles_t cpu_used_ve; + struct kstat_lat_pcpu_struct sched_lat_ve; + +#ifdef CONFIG_INET + struct venet_stat *stat; +#ifdef CONFIG_VE_IPTABLES +/* core/netfilter.c virtualization */ + struct xt_table *_ve_ipt_filter_pf; /* packet_filter struct */ + struct xt_table 
*_ve_ip6t_filter_pf; + struct xt_table *_ipt_mangle_table; + struct xt_table *_ip6t_mangle_table; + struct list_head _xt_tables[NPROTO]; + + __u64 ipt_mask; + __u64 _iptables_modules; + struct ve_nf_conntrack *_nf_conntrack; + struct ve_ipt_recent *_ipt_recent; + struct ve_xt_hashlimit *_xt_hashlimit; +#endif /* CONFIG_VE_IPTABLES */ +#endif + wait_queue_head_t *_log_wait; + unsigned *_log_start; + unsigned *_log_end; + unsigned *_logged_chars; + char *log_buf; +#define VE_DEFAULT_LOG_BUF_LEN 4096 + + struct ve_cpu_stats *cpu_stats; + unsigned long down_at; + struct list_head cleanup_list; +#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) + struct list_head _fuse_conn_list; + struct super_block *_fuse_control_sb; + + struct file_system_type *fuse_fs_type; + struct file_system_type *fuse_ctl_fs_type; +#endif + unsigned long jiffies_fixup; + unsigned char disable_net; + struct ve_monitor *monitor; + struct proc_dir_entry *monitor_proc; + unsigned long meminfo_val; + int _randomize_va_space; + +#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) \ + || defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) + unsigned int _nlmsvc_users; + struct task_struct* _nlmsvc_task; + unsigned long _nlmsvc_grace_period; + unsigned long _nlmsvc_timeout; + struct svc_rqst* _nlmsvc_rqst; +#endif + +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) + struct file_system_type *bm_fs_type; + struct vfsmount *bm_mnt; + int bm_enabled; + int bm_entry_count; + struct list_head bm_entries; +#endif + atomic_t locks_in_grace; + + struct nsproxy *ve_ns; + struct user_namespace *user_ns; + struct net *ve_netns; + struct cgroup *ve_cgroup; + struct css_set *ve_css_set; +}; + +#define VE_MEMINFO_DEFAULT 1 /* default behaviour */ +#define VE_MEMINFO_SYSTEM 0 /* disable meminfo virtualization */ + +enum { + VE_REBOOT, +}; + +int init_ve_cgroups(struct ve_struct *ve); +void fini_ve_cgroups(struct ve_struct *ve); + +extern struct ve_cpu_stats static_ve_cpu_stats; 
+static inline struct ve_cpu_stats *VE_CPU_STATS(struct ve_struct *ve, int cpu) +{ + return per_cpu_ptr(ve->cpu_stats, cpu); +} + +extern int nr_ve; +extern struct proc_dir_entry *proc_vz_dir; +extern struct proc_dir_entry *glob_proc_vz_dir; + +#ifdef CONFIG_VE + +void do_update_load_avg_ve(void); +void do_env_free(struct ve_struct *ptr); + +static inline struct ve_struct *get_ve(struct ve_struct *ptr) +{ + if (ptr != NULL) + atomic_inc(&ptr->counter); + return ptr; +} + +static inline void put_ve(struct ve_struct *ptr) +{ + if (ptr && atomic_dec_and_test(&ptr->counter)) + do_env_free(ptr); +} + +static inline void pget_ve(struct ve_struct *ptr) +{ + atomic_inc(&ptr->pcounter); +} + +void ve_cleanup_schedule(struct ve_struct *); +static inline void pput_ve(struct ve_struct *ptr) +{ + if (unlikely(atomic_dec_and_test(&ptr->pcounter))) + ve_cleanup_schedule(ptr); +} + +extern spinlock_t ve_cleanup_lock; +extern struct list_head ve_cleanup_list; +extern struct task_struct *ve_cleanup_thread; + +extern int (*do_ve_enter_hook)(struct ve_struct *ve, unsigned int flags); +extern void (*do_env_free_hook)(struct ve_struct *ve); + +extern unsigned long long ve_relative_clock(struct timespec * ts); + +#ifdef CONFIG_FAIRSCHED +#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask) +#else +#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0) +#endif +#else /* CONFIG_VE */ +#define ve_utsname system_utsname +#define get_ve(ve) (NULL) +#define put_ve(ve) do { } while (0) +#define pget_ve(ve) do { } while (0) +#define pput_ve(ve) do { } while (0) +#endif /* CONFIG_VE */ + +#endif /* _LINUX_VE_H */ diff -urNp linux-2.6.32.48/include/linux/veip.h linux-2.6.32.48-openvz/include/linux/veip.h --- linux-2.6.32.48/include/linux/veip.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/veip.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,15 @@ +#ifndef __VE_IP_H_ +#define __VE_IP_H_ + +struct ve_addr_struct { + int 
family; + __u32 key[4]; +}; + +struct sockaddr; + +extern void veaddr_print(char *, int, struct ve_addr_struct *); +extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, + struct ve_addr_struct *veaddr); + +#endif diff -urNp linux-2.6.32.48/include/linux/venet.h linux-2.6.32.48-openvz/include/linux/venet.h --- linux-2.6.32.48/include/linux/venet.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/venet.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,95 @@ +/* + * include/linux/venet.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _VENET_H +#define _VENET_H + +#include +#include +#include +#include +#include + +#define VEIP_HASH_SZ 512 + +struct ve_struct; +struct venet_stat; +struct venet_stats { + struct net_device_stats stats; + struct net_device_stats *real_stats; +}; + +struct ip_entry_struct +{ + struct ve_addr_struct addr; + struct ve_struct *active_env; + struct venet_stat *stat; + struct veip_struct *veip; + struct list_head ip_hash; + struct list_head ve_list; +}; + +struct ext_entry_struct +{ + struct list_head list; + struct ve_addr_struct addr; +}; + +struct veip_struct +{ + struct list_head src_lh; + struct list_head dst_lh; + struct list_head ip_lh; + struct list_head list; + struct list_head ext_lh; + envid_t veid; +}; + +static inline struct net_device_stats * +venet_stats(struct net_device *dev, int cpu) +{ + struct venet_stats *stats; + stats = (struct venet_stats*)dev->ml_priv; + return per_cpu_ptr(stats->real_stats, cpu); +} + +/* veip_hash_lock should be taken for write by caller */ +void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip); +/* veip_hash_lock should be taken for write by caller */ +void ip_entry_unhash(struct ip_entry_struct *entry); +/* veip_hash_lock should be taken for read by caller */ +struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *); + +/* 
veip_hash_lock should be taken for read by caller */ +struct veip_struct *veip_find(envid_t veid); +/* veip_hash_lock should be taken for write by caller */ +struct veip_struct *veip_findcreate(envid_t veid); +/* veip_hash_lock should be taken for write by caller */ +void veip_put(struct veip_struct *veip); + +extern struct list_head veip_lh; + +int veip_start(struct ve_struct *ve); +void veip_stop(struct ve_struct *ve); +__exit void veip_cleanup(void); +int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr); +int veip_entry_del(envid_t veid, struct ve_addr_struct *addr); +int venet_change_skb_owner(struct sk_buff *skb); +struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve, + struct ve_addr_struct *addr); + +extern struct list_head ip_entry_hash_table[]; +extern rwlock_t veip_hash_lock; + +#ifdef CONFIG_PROC_FS +int veip_seq_show(struct seq_file *m, void *v); +#endif + +#endif diff -urNp linux-2.6.32.48/include/linux/ve_nfs.h linux-2.6.32.48-openvz/include/linux/ve_nfs.h --- linux-2.6.32.48/include/linux/ve_nfs.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/ve_nfs.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,30 @@ +/* + * linux/include/ve_nfs.h + * + * VE context for NFS + * + * Copyright (C) 2007 SWsoft + */ + +#ifndef __VE_NFS_H__ +#define __VE_NFS_H__ + +#ifdef CONFIG_VE + +#include + +#define NFS_CTX_FIELD(arg) (get_exec_env()->_##arg) + +#else /* CONFIG_VE */ + +#define NFS_CTX_FIELD(arg) _##arg + +#endif /* CONFIG_VE */ + +#define nlmsvc_grace_period NFS_CTX_FIELD(nlmsvc_grace_period) +#define nlmsvc_timeout NFS_CTX_FIELD(nlmsvc_timeout) +#define nlmsvc_users NFS_CTX_FIELD(nlmsvc_users) +#define nlmsvc_task NFS_CTX_FIELD(nlmsvc_task) +#define nlmsvc_rqst NFS_CTX_FIELD(nlmsvc_rqst) + +#endif diff -urNp linux-2.6.32.48/include/linux/veprintk.h linux-2.6.32.48-openvz/include/linux/veprintk.h --- linux-2.6.32.48/include/linux/veprintk.h 1969-12-31 19:00:00.000000000 -0500 +++ 
linux-2.6.32.48-openvz/include/linux/veprintk.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,38 @@ +/* + * include/linux/veprintk.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VE_PRINTK_H__ +#define __VE_PRINTK_H__ + +#ifdef CONFIG_VE + +#define ve_log_wait (*(get_exec_env()->_log_wait)) +#define ve_log_start (*(get_exec_env()->_log_start)) +#define ve_log_end (*(get_exec_env()->_log_end)) +#define ve_logged_chars (*(get_exec_env()->_logged_chars)) +#define ve_log_buf (get_exec_env()->log_buf) +#define ve_log_buf_len (ve_is_super(get_exec_env()) ? \ + log_buf_len : VE_DEFAULT_LOG_BUF_LEN) +#define VE_LOG_BUF_MASK (ve_log_buf_len - 1) +#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK]) + +#else + +#define ve_log_wait log_wait +#define ve_log_start log_start +#define ve_log_end log_end +#define ve_logged_chars logged_chars +#define ve_log_buf log_buf +#define ve_log_buf_len log_buf_len +#define VE_LOG_BUF_MASK LOG_BUF_MASK +#define VE_LOG_BUF(idx) LOG_BUF(idx) + +#endif /* CONFIG_VE */ +#endif /* __VE_PRINTK_H__ */ diff -urNp linux-2.6.32.48/include/linux/ve_proto.h linux-2.6.32.48-openvz/include/linux/ve_proto.h --- linux-2.6.32.48/include/linux/ve_proto.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/ve_proto.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,101 @@ +/* + * include/linux/ve_proto.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __VE_H__ +#define __VE_H__ + +#ifdef CONFIG_VE + +struct ve_struct; + +struct seq_file; + +typedef void (*ve_seq_print_t)(struct seq_file *, struct ve_struct *); + +void vzmon_register_veaddr_print_cb(ve_seq_print_t); +void vzmon_unregister_veaddr_print_cb(ve_seq_print_t); + +#ifdef CONFIG_INET +void tcp_v4_kill_ve_sockets(struct ve_struct *envid); +#ifdef CONFIG_VE_NETDEV +int venet_init(void); +#endif +#endif + +#define VE_IOPRIO_MIN 0 +#define VE_IOPRIO_MAX 8 +extern int ve_set_ioprio(int veid, int ioprio); + +extern struct list_head ve_list_head; +#define for_each_ve(ve) list_for_each_entry((ve), &ve_list_head, ve_list) +extern rwlock_t ve_list_lock; +extern struct ve_struct *get_ve_by_id(envid_t); +extern struct ve_struct *__find_ve_by_id(envid_t); + +struct env_create_param3; +extern int real_env_create(envid_t veid, unsigned flags, u32 class_id, + struct env_create_param3 *data, int datalen); +extern void ve_move_task(struct task_struct *, struct ve_struct *, struct cred *); + +int set_device_perms_ve(struct ve_struct *, unsigned, dev_t, unsigned); +int get_device_perms_ve(int dev_type, dev_t dev, int access_mode); +int devperms_seq_show(struct seq_file *m, void *v); + +enum { + VE_SS_CHAIN, + VE_INIT_EXIT_CHAIN, + + VE_MAX_CHAINS +}; + +typedef int ve_hook_init_fn(void *data); +typedef void ve_hook_fini_fn(void *data); + +struct ve_hook +{ + ve_hook_init_fn *init; + ve_hook_fini_fn *fini; + struct module *owner; + + /* Functions are called in ascending priority */ + int priority; + + /* Private part */ + struct list_head list; +}; + +enum { + HOOK_PRIO_DEFAULT = 0, + + HOOK_PRIO_FS = HOOK_PRIO_DEFAULT, + + HOOK_PRIO_NET_PRE, + HOOK_PRIO_NET, + HOOK_PRIO_NET_POST, + + HOOK_PRIO_AFTERALL = INT_MAX +}; + +void *ve_seq_start(struct seq_file *m, loff_t *pos); +void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos); +void ve_seq_stop(struct seq_file *m, void *v); + +extern int ve_hook_iterate_init(int chain, void *data); +extern void 
ve_hook_iterate_fini(int chain, void *data); + +extern void ve_hook_register(int chain, struct ve_hook *vh); +extern void ve_hook_unregister(struct ve_hook *vh); +#else /* CONFIG_VE */ +#define ve_hook_register(ch, vh) do { } while (0) +#define ve_hook_unregister(ve) do { } while (0) + +#define get_device_perms_ve(t, d, a) (0) +#endif /* CONFIG_VE */ +#endif diff -urNp linux-2.6.32.48/include/linux/ve_task.h linux-2.6.32.48-openvz/include/linux/ve_task.h --- linux-2.6.32.48/include/linux/ve_task.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/ve_task.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,73 @@ +/* + * include/linux/ve_task.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VE_TASK_H__ +#define __VE_TASK_H__ + +#include +#include + +struct ve_task_info { +/* virtualization */ + struct ve_struct *owner_env; + struct ve_struct *exec_env; + struct ve_struct *saved_env; + struct list_head vetask_list; + struct dentry *glob_proc_dentry; +/* statistics: scheduling latency */ + cycles_t sleep_time; + cycles_t sched_time; + cycles_t sleep_stamp; + cycles_t wakeup_stamp; + seqcount_t wakeup_lock; +}; + +#define VE_TASK_INFO(task) (&(task)->ve_task_info) +#define VE_TASK_LIST_2_TASK(lh) \ + list_entry(lh, struct task_struct, ve_task_info.vetask_list) + +#ifdef CONFIG_VE +extern struct ve_struct ve0; +#define get_ve0() (&ve0) + +#define ve_save_context(t) do { \ + t->ve_task_info.saved_env = \ + t->ve_task_info.exec_env; \ + t->ve_task_info.exec_env = get_ve0(); \ + } while (0) +#define ve_restore_context(t) do { \ + t->ve_task_info.exec_env = \ + t->ve_task_info.saved_env; \ + } while (0) + +#define get_exec_env() (current->ve_task_info.exec_env) +#define set_exec_env(ve) ({ \ + struct ve_task_info *vi; \ + struct ve_struct *old, *new; \ + \ + vi = &current->ve_task_info; \ + old = vi->exec_env; \ + new = ve; \ + if (unlikely(new == NULL)) { \ + 
printk("%s: NULL exec env (%s)\n", __func__, #ve);\ + new = get_ve0(); \ + } \ + vi->exec_env = new; \ + old; \ + }) +#else +#define get_ve0() (NULL) +#define get_exec_env() (NULL) +#define set_exec_env(new_env) (NULL) +#define ve_save_context(t) do { } while (0) +#define ve_restore_context(t) do { } while (0) +#endif + +#endif /* __VE_TASK_H__ */ diff -urNp linux-2.6.32.48/include/linux/veth.h linux-2.6.32.48-openvz/include/linux/veth.h --- linux-2.6.32.48/include/linux/veth.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/veth.h 2011-11-21 17:40:47.000000000 -0500 @@ -1,3 +1,12 @@ +/* + * include/linux/veth.h + * + * Copyright (C) 2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ #ifndef __NET_VETH_H_ #define __NET_VETH_H_ @@ -9,4 +18,29 @@ enum { #define VETH_INFO_MAX (__VETH_INFO_MAX - 1) }; +#ifdef __KERNEL__ +struct veth_struct +{ + struct net_device_stats stats; + struct net_device *me; + struct net_device *pair; + struct list_head hwaddr_list; + struct net_device_stats *real_stats; + int allow_mac_change; +}; + +#define veth_from_netdev(dev) \ + ((struct veth_struct *)(netdev_priv(dev))) +static inline struct net_device * veth_to_netdev(struct veth_struct *veth) +{ + return veth->me; +} +#endif + +static inline struct net_device_stats * +veth_stats(struct net_device *dev, int cpuid) +{ + return per_cpu_ptr(veth_from_netdev(dev)->real_stats, cpuid); +} + #endif diff -urNp linux-2.6.32.48/include/linux/virtinfo.h linux-2.6.32.48-openvz/include/linux/virtinfo.h --- linux-2.6.32.48/include/linux/virtinfo.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/virtinfo.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,100 @@ +/* + * include/linux/virtinfo.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
#ifdef CONFIG_VZ_GENCALLS

/*
 * Run the VITYPE_GENERAL notifier chain for event @n with payload @data and
 * turn the chain's verdict into an errno-style result:
 *   NOTIFY_FAIL bit set -> -ENOBUFS
 *   NOTIFY_OK bit set   -> -ERESTARTNOINTR
 *   neither             -> 0
 */
static inline int virtinfo_gencall(unsigned long n, void *data)
{
	const int verdict = virtinfo_notifier_call(VITYPE_GENERAL, n, data);

	/* NOTIFY_FAIL is tested first, so it wins when both bits are set. */
	if (verdict & NOTIFY_FAIL)
		return -ENOBUFS;

	return (verdict & NOTIFY_OK) ? -ERESTARTNOINTR : 0;
}

#else

/* No general notifier chain: always 0, and the arguments are not evaluated. */
#define virtinfo_gencall(n, data)	0

#endif
*vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); +extern void *vmalloc_best(unsigned long size); +extern void *ub_vmalloc_best(unsigned long size); extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); extern void vfree(const void *addr); @@ -68,6 +76,7 @@ extern void vunmap(const void *addr); extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); void vmalloc_sync_all(void); +extern void vprintstat(void); /* * Lowlevel-APIs (not for driver use!) diff -urNp linux-2.6.32.48/include/linux/vmstat.h linux-2.6.32.48-openvz/include/linux/vmstat.h --- linux-2.6.32.48/include/linux/vmstat.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/vmstat.h 2011-11-21 17:40:47.000000000 -0500 @@ -105,6 +105,7 @@ static inline void vm_events_fold_cpu(in } #endif +extern unsigned long vm_events(enum vm_event_item i); #else /* Disable counters */ @@ -127,6 +128,7 @@ static inline void vm_events_fold_cpu(in { } +static inline unsigned long vm_events(enum vm_event_item i) { return 0; } #endif /* CONFIG_VM_EVENT_COUNTERS */ #define __count_zone_vm_events(item, zone, delta) \ diff -urNp linux-2.6.32.48/include/linux/vzcalluser.h linux-2.6.32.48-openvz/include/linux/vzcalluser.h --- linux-2.6.32.48/include/linux/vzcalluser.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/vzcalluser.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,202 @@ +/* + * include/linux/vzcalluser.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
/*
 * Payload versions for VE creation.  Each later version repeats the fields
 * of the previous one and appends new ones at the end, so a shorter payload
 * is a prefix of a longer one.
 */

/* Version 1: the smallest accepted payload. */
struct env_create_param {
	__u64 iptables_mask;	/* presumably VE_IP_* module bits -- TODO confirm */
};

/* Lower bound for the payload length: the size of the version 1 struct. */
#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(struct env_create_param)

/*
 * Version 2.
 * NOTE(review): there is no explicit tail padding after total_vcpus, so
 * sizeof() of this struct can differ between ABIs that align __u64
 * differently -- confirm before relying on it.
 */
struct env_create_param2 {
	__u64 iptables_mask;
	__u64 feature_mask;	/* presumably VE_FEATURE_* bits -- TODO confirm */
	__u32 total_vcpus; /* 0 - don't care, same as in host */
};

/* Version 3. */
struct env_create_param3 {
	__u64 iptables_mask;
	__u64 feature_mask;
	__u32 total_vcpus;
	__u32 pad;		/* explicit filler: known_features starts at offset 24 */
	__u64 known_features;
};
/* One load-average sample split into two integers. */
struct vz_load_avg {
	int val_int;	/* presumably the integer part -- TODO confirm */
	int val_frac;	/* presumably the fractional part; scale not visible here */
};

/*
 * CPU statistics of one VE.  Reached from user space through the cpustat
 * pointer of struct vzctl_cpustatctl (VZCTL_GET_CPU_STAT).
 *
 * NOTE(review): the unsigned long members give this struct a different
 * layout for 32-bit and 64-bit user space, and this header declares no
 * compat_ counterpart for it or for struct vzctl_cpustatctl -- confirm how
 * compat callers of VZCTL_GET_CPU_STAT are handled.
 */
struct vz_cpu_stat {
	/* *_jif: presumably measured in jiffies -- TODO confirm */
	unsigned long user_jif;
	unsigned long nice_jif;
	unsigned long system_jif;
	unsigned long uptime_jif;
	/* *_clk: presumably measured in clock cycles -- TODO confirm */
	__u64 idle_clk;
	__u64 strv_clk;
	__u64 uptime_clk;
	struct vz_load_avg avenrun[3]; /* loadavg data */
};
compat_uptr_t data; + int datalen; +}; + +#define VZCTL_COMPAT_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ + struct compat_vzctl_env_create_data) +#define VZCTL_COMPAT_VE_NETDEV _IOW(VZCTLTYPE, 11, \ + struct compat_vzctl_ve_netdev) +#define VZCTL_COMPAT_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ + struct compat_vzctl_ve_meminfo) +#endif +#endif + +#endif diff -urNp linux-2.6.32.48/include/linux/vzctl.h linux-2.6.32.48-openvz/include/linux/vzctl.h --- linux-2.6.32.48/include/linux/vzctl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/vzctl.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,30 @@ +/* + * include/linux/vzctl.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_VZCTL_H +#define _LINUX_VZCTL_H + +#include + +struct module; +struct inode; +struct file; +struct vzioctlinfo { + unsigned type; + int (*ioctl)(struct file *, unsigned int, unsigned long); + int (*compat_ioctl)(struct file *, unsigned int, unsigned long); + struct module *owner; + struct list_head list; +}; + +extern void vzioctl_register(struct vzioctlinfo *inf); +extern void vzioctl_unregister(struct vzioctlinfo *inf); + +#endif diff -urNp linux-2.6.32.48/include/linux/vzctl_quota.h linux-2.6.32.48-openvz/include/linux/vzctl_quota.h --- linux-2.6.32.48/include/linux/vzctl_quota.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/vzctl_quota.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,74 @@ +/* + * include/linux/vzctl_quota.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
#ifndef __KERNEL__
#define __user
#endif

/*
 * Quota management ioctl
 */

/* Argument of VZCTL_QUOTA_DEPR_CTL and VZCTL_QUOTA_NEW_CTL. */
struct vz_quota_stat;
struct vzctl_quotactl {
	int cmd;
	unsigned int quota_id;
	struct vz_quota_stat __user *qstat;
	char __user *ve_root;
};

/*
 * Argument of VZCTL_QUOTA_UGID_CTL.
 *
 * addr points into user space (the compat twin carries it as compat_uptr_t),
 * so it is annotated __user like the pointers in struct vzctl_quotactl.
 * The annotation expands to nothing outside the kernel; the layout is
 * unchanged.
 */
struct vzctl_quotaugidctl {
	int cmd;		/* subcommand */
	unsigned int quota_id;	/* quota id where it applies to */
	unsigned int ugid_index;/* for reading statistic. index of first
				    uid/gid record to read */
	unsigned int ugid_size;	/* size of ugid_buf array */
	void __user *addr;	/* user-level buffer */
};
/*
 * Argument of VENETCTL_VE_IP_MAP: apply operation `op` for the address
 * described by addr/addrlen to VE `veid`.
 *
 * NOTE(review): the compat twin (struct compat_vzctl_ve_ip_map) carries addr
 * as compat_uptr_t, i.e. it is a user-space pointer, yet it has no __user
 * annotation here -- confirm whether one should be added (this header does
 * not visibly define the user-space __user shim).
 */
struct vzctl_ve_ip_map {
	envid_t veid;
	int op;		/* one of the VE_IP_* values below */
#define VE_IP_ADD 1
#define VE_IP_DEL 2
#define VE_IP_EXT_ADD 3
#define VE_IP_EXT_DEL 4
	struct sockaddr *addr;	/* address the operation applies to */
	int addrlen;		/* presumably the size of *addr in bytes -- TODO confirm */
};
/* Key type of the quota tree. */
typedef unsigned int quotaid_t;

#define QUOTAID_BITS	32	/* width of a quota id */
#define QUOTAID_BBITS	4	/* id bits consumed per tree level */
#define QUOTAID_EBITS	8	/* id bits per "external" unit */

/* An external unit must cover a whole number of tree levels. */
#if (QUOTAID_EBITS % QUOTAID_BBITS) != 0
#error Quota bit assumption failure
#endif

/* Slots per node, and the mask that extracts one slot index. */
#define QUOTATREE_BSIZE		(1 << QUOTAID_BBITS)
#define QUOTATREE_BMASK		(QUOTATREE_BSIZE - 1)

/* Level counts: QUOTAID_BITS divided by the unit width, rounded up. */
#define QUOTATREE_DEPTH \
	((QUOTAID_BITS + QUOTAID_BBITS - 1) / QUOTAID_BBITS)
#define QUOTATREE_EDEPTH \
	((QUOTAID_BITS + QUOTAID_EBITS - 1) / QUOTAID_EBITS)

/* Right shift that brings level lvl's slot index down to the low bits. */
#define QUOTATREE_BSHIFT(lvl) \
	((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS)
/*
 * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes.
 * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS),
 * and each node contains 2^QUOTAID_BBITS pointers.
 * Level 0 is a (single) tree root node.
 *
 * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data.
 * Nodes of lower levels contain pointers to nodes.
 *
 * Double pointers in the array of an i-level node that point to an
 * (i+1)-level node (such as inside quotatree_find_state) are marked by
 * level (i+1), not i.
 * The level 0 double pointer is the pointer to the root inside the tree struct.
 *
 * The tree is permanent, i.e. all allocated index blocks are kept alive to
 * preserve the block numbers in the quota file tree and so keep its changes
 * local.
 */
/*
 * These masks represent modules.
 *
 * Strictly speaking only a small subset of these bits is used nowadays,
 * but every bit that was ever used MUST stay reserved for the sake of ABI
 * compatibility (i.e. compatibility with the vzctl user-space utility).
 *
 * DON'T EVER DELETE/MODIFY THESE BITS
 */

/*
 * Emits one enumerator, "name = 1U << shift".  The shift argument is
 * parenthesized so that an expression such as "A | B" shifts by the whole
 * expression instead of being torn apart by operator precedence.
 */
#define VE_IPT_GENERATE(name, shift)	name = (1U << (shift))

/* One bit per module.  Bit 13 is not assigned in this list. */
enum ve_ipt_mods {
	VE_IPT_GENERATE(VE_IP_IPTABLES_MOD,		0),
	VE_IPT_GENERATE(VE_IP_FILTER_MOD,		1),
	VE_IPT_GENERATE(VE_IP_MANGLE_MOD,		2),
	VE_IPT_GENERATE(VE_IP_MATCH_LIMIT_MOD,		3),
	VE_IPT_GENERATE(VE_IP_MATCH_MULTIPORT_MOD,	4),
	VE_IPT_GENERATE(VE_IP_MATCH_TOS_MOD,		5),
	VE_IPT_GENERATE(VE_IP_TARGET_TOS_MOD,		6),
	VE_IPT_GENERATE(VE_IP_TARGET_REJECT_MOD,	7),
	VE_IPT_GENERATE(VE_IP_TARGET_TCPMSS_MOD,	8),
	VE_IPT_GENERATE(VE_IP_MATCH_TCPMSS_MOD,		9),
	VE_IPT_GENERATE(VE_IP_MATCH_TTL_MOD,		10),
	VE_IPT_GENERATE(VE_IP_TARGET_LOG_MOD,		11),
	VE_IPT_GENERATE(VE_IP_MATCH_LENGTH_MOD,		12),
	VE_IPT_GENERATE(VE_IP_CONNTRACK_MOD,		14),
	VE_IPT_GENERATE(VE_IP_CONNTRACK_FTP_MOD,	15),
	VE_IPT_GENERATE(VE_IP_CONNTRACK_IRC_MOD,	16),
	VE_IPT_GENERATE(VE_IP_MATCH_CONNTRACK_MOD,	17),
	VE_IPT_GENERATE(VE_IP_MATCH_STATE_MOD,		18),
	VE_IPT_GENERATE(VE_IP_MATCH_HELPER_MOD,		19),
	VE_IPT_GENERATE(VE_IP_NAT_MOD,			20),
	VE_IPT_GENERATE(VE_IP_NAT_FTP_MOD,		21),
	VE_IPT_GENERATE(VE_IP_NAT_IRC_MOD,		22),
	VE_IPT_GENERATE(VE_IP_TARGET_REDIRECT_MOD,	23),
	VE_IPT_GENERATE(VE_IP_MATCH_OWNER_MOD,		24),
	VE_IPT_GENERATE(VE_IP_MATCH_MAC_MOD,		25),
	VE_IPT_GENERATE(VE_IP_IPTABLES6_MOD,		26),
	VE_IPT_GENERATE(VE_IP_FILTER6_MOD,		27),
	VE_IPT_GENERATE(VE_IP_MANGLE6_MOD,		28),
	VE_IPT_GENERATE(VE_IP_IPTABLE_NAT_MOD,		29),
	VE_IPT_GENERATE(VE_NF_CONNTRACK_MOD,		30),
};
/*
 * True when every bit requested in @mask is also set in @permitted,
 * i.e. @mask is a subset of @permitted.  An empty @mask is always allowed.
 */
static inline bool mask_ipt_allow(__u64 permitted, __u64 mask)
{
	/* bits that are requested but not granted */
	const __u64 missing = mask & ~permitted;

	return missing == 0;
}
/* Values for dq_info->flags */
#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */
#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */

/*
 * Grace-period settings and warning state that accompany a struct dq_stat
 * (the two are paired in struct vz_quota_stat).
 */
struct dq_info {
	time_t bexpire; /* expire timeout for excessive disk use */
	time_t iexpire; /* expire timeout for excessive inode use */
	unsigned flags; /* VZ_QUOTA_INODES | VZ_QUOTA_SPACE, see defines above */
};
compat_dq_stat qi_stat; /* limits, options, usage stats */ +}; + +static inline void compat_dqstat2dqstat(struct compat_dq_stat *odqs, + struct dq_stat *dqs) +{ + dqs->bhardlimit = odqs->bhardlimit; + dqs->bsoftlimit = odqs->bsoftlimit; + dqs->bcurrent = odqs->bcurrent; + dqs->btime = odqs->btime; + + dqs->ihardlimit = odqs->ihardlimit; + dqs->isoftlimit = odqs->isoftlimit; + dqs->icurrent = odqs->icurrent; + dqs->itime = odqs->itime; +} + +static inline void compat_dqinfo2dqinfo(struct compat_dq_info *odqi, + struct dq_info *dqi) +{ + dqi->bexpire = odqi->bexpire; + dqi->iexpire = odqi->iexpire; + dqi->flags = odqi->flags; +} + +static inline void dqstat2compat_dqstat(struct dq_stat *dqs, + struct compat_dq_stat *odqs) +{ + odqs->bhardlimit = dqs->bhardlimit; + odqs->bsoftlimit = dqs->bsoftlimit; + odqs->bcurrent = dqs->bcurrent; + odqs->btime = (compat_time_t)dqs->btime; + + odqs->ihardlimit = dqs->ihardlimit; + odqs->isoftlimit = dqs->isoftlimit; + odqs->icurrent = dqs->icurrent; + odqs->itime = (compat_time_t)dqs->itime; +} + +static inline void dqinfo2compat_dqinfo(struct dq_info *dqi, + struct compat_dq_info *odqi) +{ + odqi->bexpire = (compat_time_t)dqi->bexpire; + odqi->iexpire = (compat_time_t)dqi->iexpire; + odqi->flags = dqi->flags; +} +#endif + +/* values for flags and dq_flags */ +/* this flag is set if the userspace has been unable to provide usage + * information about all ugids + * if the flag is set, we don't allocate new UG quota blocks (their + * current usage is unknown) or free existing UG quota blocks (not to + * lose information that this block is ok) */ +#define VZDQUG_FIXED_SET 0x01 +/* permit to use ugid quota */ +#define VZDQUG_ON 0x02 +#define VZDQ_USRQUOTA 0x10 +#define VZDQ_GRPQUOTA 0x20 +#define VZDQ_NOACT 0x1000 /* not actual */ +#define VZDQ_NOQUOT 0x2000 /* not under quota tree */ + +struct vz_quota_ugid_stat { + unsigned int limit; /* max amount of ugid records */ + unsigned int count; /* amount of ugid records */ + unsigned int 
flags; +}; + +struct vz_quota_ugid_setlimit { + unsigned int type; /* quota type (USR/GRP) */ + unsigned int id; /* ugid */ + struct if_dqblk dqb; /* limits info */ +}; + +struct vz_quota_ugid_setinfo { + unsigned int type; /* quota type (USR/GRP) */ + struct if_dqinfo dqi; /* grace info */ +}; + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include + +/* Values for dq_info flags */ +#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ +#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ + +/* values for dq_state */ +#define VZDQ_STARTING 0 /* created, not turned on yet */ +#define VZDQ_WORKING 1 /* quota created, turned on */ +#define VZDQ_STOPING 2 /* created, turned on and off */ + +/* master quota record - one per veid */ +struct vz_quota_master { + struct list_head dq_hash; /* next quota in hash list */ + atomic_t dq_count; /* inode reference count */ + unsigned int dq_flags; /* see VZDQUG_FIXED_SET */ + unsigned int dq_state; /* see values above */ + unsigned int dq_id; /* VEID this applies to */ + struct dq_stat dq_stat; /* limits, grace, usage stats */ + struct dq_info dq_info; /* grace times and flags */ + spinlock_t dq_data_lock; /* for dq_stat */ + + struct mutex dq_mutex; /* mutex to protect + ugid tree */ + + struct list_head dq_ilink_list; /* list of vz_quota_ilink */ + struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */ + struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree for GIDs */ + unsigned int dq_ugid_count; /* amount of ugid records */ + unsigned int dq_ugid_max; /* max amount of ugid records */ + struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */ + + struct path dq_root_path; /* path of fs tree */ + struct super_block *dq_sb; /* superblock of our quota root */ +}; + +/* UID/GID quota record - one per pair (quota_master, uid or gid) */ +struct vz_quota_ugid { + unsigned int qugid_id; /* UID/GID this applies to */ + struct dq_stat qugid_stat; /* limits, options, 
usage stats */ + int qugid_type; /* USRQUOTA|GRPQUOTA */ + atomic_t qugid_count; /* reference count */ +}; + +#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11) + +struct vz_quota_datast { + struct vz_quota_ilink qlnk; +}; + +#define VIRTINFO_QUOTA_GETSTAT 0 +#define VIRTINFO_QUOTA_ON 1 +#define VIRTINFO_QUOTA_OFF 2 +#define VIRTINFO_QUOTA_DISABLE 3 + +struct virt_info_quota { + struct super_block *super; + struct dq_stat *qstat; +}; + +/* + * Interface to VZ quota core + */ +#define INODE_QLNK(inode) (&(inode)->i_qlnk) +#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk) + +#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef) + +#define VZ_QUOTAO_SETE 1 +#define VZ_QUOTAO_INIT 2 +#define VZ_QUOTAO_DESTR 3 +#define VZ_QUOTAO_SWAP 4 +#define VZ_QUOTAO_INICAL 5 +#define VZ_QUOTAO_DRCAL 6 +#define VZ_QUOTAO_QSET 7 +#define VZ_QUOTAO_TRANS 8 +#define VZ_QUOTAO_ACT 9 +#define VZ_QUOTAO_DTREE 10 +#define VZ_QUOTAO_DET 11 +#define VZ_QUOTAO_ON 12 +#define VZ_QUOTAO_RE_LOCK 13 + +#define DQUOT_CMD_ALLOC 0 +#define DQUOT_CMD_PREALLOC 1 +#define DQUOT_CMD_CHECK 12 +#define DQUOT_CMD_FORCE 13 + +extern struct mutex vz_quota_mutex; + +void inode_qmblk_lock(struct super_block *sb); +void inode_qmblk_unlock(struct super_block *sb); +void qmblk_data_read_lock(struct vz_quota_master *qmblk); +void qmblk_data_read_unlock(struct vz_quota_master *qmblk); +void qmblk_data_write_lock(struct vz_quota_master *qmblk); +void qmblk_data_write_unlock(struct vz_quota_master *qmblk); + +/* for quota operations */ +void vzquota_inode_init_call(struct inode *inode); +void vzquota_inode_swap_call(struct inode *, struct inode *); +void vzquota_inode_drop_call(struct inode *inode); +int vzquota_inode_transfer_call(struct inode *, struct iattr *); +struct vz_quota_master *vzquota_inode_data(struct inode *inode, + struct vz_quota_datast *); +void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *); +int vzquota_rename_check(struct inode *inode, + struct 
inode *old_dir, struct inode *new_dir); +struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode); +/* for second-level quota */ +struct vz_quota_master *vzquota_find_qmblk(struct super_block *); /* NOTE(review): declared a second time further down in this header */ +/* for management operations */ +struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, + struct vz_quota_stat *qstat); +void vzquota_free_master(struct vz_quota_master *); +struct vz_quota_master *vzquota_find_master(unsigned int quota_id); +int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, + struct vz_quota_master *qmblk, char __user *buf); +int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk, + char __user *buf, int force); +int vzquota_get_super(struct super_block *sb); +void vzquota_put_super(struct super_block *sb); + +static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk) +{ + if (!atomic_read(&qmblk->dq_count)) + BUG(); /* a reference may only be taken while dq_count is still non-zero */ + atomic_inc(&qmblk->dq_count); + return qmblk; /* hands back its own argument */ +} + +static inline void __qmblk_put(struct vz_quota_master *qmblk) +{ + atomic_dec(&qmblk->dq_count); /* plain decrement: unlike qmblk_put(), never frees the master */ +} + +static inline void qmblk_put(struct vz_quota_master *qmblk) +{ + if (!atomic_dec_and_test(&qmblk->dq_count)) + return; /* other references remain */ + vzquota_free_master(qmblk); /* dq_count reached zero: hand the master to vzquota_free_master() */ +} + +extern struct list_head vzquota_hash_table[]; +extern int vzquota_hash_size; + +/* + * Interface to VZ UGID quota + */ +extern struct quotactl_ops vz_quotactl_operations; +extern struct dquot_operations vz_quota_operations2; +extern struct quota_format_type vz_quota_empty_v2_format; + +#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? 
\ + qmblk->dq_uid_tree : \ + qmblk->dq_gid_tree) + +#define VZDQUG_FIND_DONT_ALLOC 1 +#define VZDQUG_FIND_FAKE 2 +struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags); +struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags); +struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid); +void vzquota_put_ugid(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid); +void vzquota_kill_ugid(struct vz_quota_master *qmblk); +int vzquota_ugid_init(void); +void vzquota_ugid_release(void); +int vzquota_transfer_usage(struct inode *inode, int mask, + struct vz_quota_ilink *qlnk); +void vzquota_inode_off(struct inode *inode); + +long do_vzquotaugidctl(int cmd, unsigned int quota_id, + unsigned int ugid_index, unsigned int ugid_size, + void *addr, int compat); + +/* + * Other VZ quota parts + */ +extern struct dquot_operations vz_quota_operations; + +long do_vzquotactl(int cmd, unsigned int quota_id, + struct vz_quota_stat __user *qstat, const char __user *ve_root, + int compat); +int vzquota_proc_init(void); +void vzquota_proc_release(void); +struct vz_quota_master *vzquota_find_qmblk(struct super_block *); + +void vzaquota_init(void); +void vzaquota_fini(void); + +#endif /* __KERNEL__ */ + +#endif /* _VZDQUOTA_H */ diff -urNp linux-2.6.32.48/include/linux/vzquota_qlnk.h linux-2.6.32.48-openvz/include/linux/vzquota_qlnk.h --- linux-2.6.32.48/include/linux/vzquota_qlnk.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/vzquota_qlnk.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,25 @@ +/* + * include/linux/vzquota_qlnk.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef _VZDQUOTA_QLNK_H +#define _VZDQUOTA_QLNK_H + +struct vz_quota_master; +struct vz_quota_ugid; + +/* inode link, used to track inodes using quota via dq_ilink_list */ +struct vz_quota_ilink { + struct vz_quota_master *qmblk; + struct vz_quota_ugid *qugid[MAXQUOTAS]; + struct list_head list; + unsigned char origin[2]; +}; + +#endif /* _VZDQUOTA_QLNK_H */ diff -urNp linux-2.6.32.48/include/linux/vzratelimit.h linux-2.6.32.48-openvz/include/linux/vzratelimit.h --- linux-2.6.32.48/include/linux/vzratelimit.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/vzratelimit.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,28 @@ +/* + * include/linux/vzratelimit.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VZ_RATELIMIT_H__ +#define __VZ_RATELIMIT_H__ + +/* + * Generic ratelimiting stuff. + */ + +struct vz_rate_info { + int burst; + int interval; /* jiffy_t per event */ + int bucket; /* kind of leaky bucket */ + unsigned long last; /* last event */ +}; + +/* Return true if rate limit permits. */ +int vz_ratelimit(struct vz_rate_info *p); + +#endif /* __VZ_RATELIMIT_H__ */ diff -urNp linux-2.6.32.48/include/linux/vzstat.h linux-2.6.32.48-openvz/include/linux/vzstat.h --- linux-2.6.32.48/include/linux/vzstat.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/vzstat.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,182 @@ +/* + * include/linux/vzstat.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __VZSTAT_H__ +#define __VZSTAT_H__ + +struct swap_cache_info_struct { + unsigned long add_total; + unsigned long del_total; + unsigned long find_success; + unsigned long find_total; + unsigned long noent_race; + unsigned long exist_race; + unsigned long remove_race; +}; + +struct kstat_lat_snap_struct { + cycles_t maxlat, totlat; + unsigned long count; +}; +struct kstat_lat_pcpu_snap_struct { + cycles_t maxlat, totlat; + unsigned long count; + seqcount_t lock; +} ____cacheline_aligned_in_smp; + +struct kstat_lat_struct { + struct kstat_lat_snap_struct cur, last; + cycles_t avg[3]; +}; +struct kstat_lat_pcpu_struct { + struct kstat_lat_pcpu_snap_struct *cur; + cycles_t max_snap; + struct kstat_lat_snap_struct last; + cycles_t avg[3]; +}; + +struct kstat_perf_snap_struct { + cycles_t wall_tottime, cpu_tottime; + cycles_t wall_maxdur, cpu_maxdur; + unsigned long count; +}; +struct kstat_perf_struct { + struct kstat_perf_snap_struct cur, last; +}; + +struct kstat_zone_avg { + unsigned long free_pages_avg[3], + nr_active_avg[3], + nr_inactive_avg[3]; +}; + +#define KSTAT_ALLOCSTAT_NR 5 + +struct kernel_stat_glob { + unsigned long nr_unint_avg[3]; + + unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR]; + struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR]; + struct kstat_lat_pcpu_struct sched_lat; + struct kstat_lat_struct swap_in; + + struct kstat_perf_struct ttfp, cache_reap, + refill_inact, shrink_icache, shrink_dcache; + + struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */ +} ____cacheline_aligned; + +extern struct kernel_stat_glob kstat_glob ____cacheline_aligned; +extern spinlock_t kstat_glb_lock; + +#ifdef CONFIG_VE +#define KSTAT_PERF_ENTER(name) \ + unsigned long flags; \ + cycles_t start, sleep_time; \ + \ + start = get_cycles(); \ + sleep_time = VE_TASK_INFO(current)->sleep_time; \ + +#define KSTAT_PERF_LEAVE(name) \ + spin_lock_irqsave(&kstat_glb_lock, flags); \ + kstat_glob.name.cur.count++; \ + start = get_cycles() - start; \ + if 
(kstat_glob.name.cur.wall_maxdur < start) \ + kstat_glob.name.cur.wall_maxdur = start;\ + kstat_glob.name.cur.wall_tottime += start; \ + start -= VE_TASK_INFO(current)->sleep_time - \ + sleep_time; \ + if (kstat_glob.name.cur.cpu_maxdur < start) \ + kstat_glob.name.cur.cpu_maxdur = start; \ + kstat_glob.name.cur.cpu_tottime += start; \ + spin_unlock_irqrestore(&kstat_glb_lock, flags); \ + +#else +#define KSTAT_PERF_ENTER(name) +#define KSTAT_PERF_LEAVE(name) +#endif + +/* + * Account one more reading: bump the count, raise the maximum if exceeded, add to the total. + * Serialization is the caller's responsibility. + */ +static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p, + cycles_t dur) +{ + p->cur.count++; + if (p->cur.maxlat < dur) + p->cur.maxlat = dur; + p->cur.totlat += dur; +} + +static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu, + cycles_t dur) +{ + struct kstat_lat_pcpu_snap_struct *cur; + + cur = per_cpu_ptr(p->cur, cpu); + write_seqcount_begin(&cur->lock); /* pairs with the read_seqcount_begin/retry loop in KSTAT_LAT_PCPU_UPDATE */ + cur->count++; + if (cur->maxlat < dur) + cur->maxlat = dur; + cur->totlat += dur; + write_seqcount_end(&cur->lock); +} + +/* + * Copy the current statistics to last and restart the current maximum. + * Serialization is the caller's responsibility. + */ +static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p) +{ + cycles_t m; + memcpy(&p->last, &p->cur, sizeof(p->last)); + p->cur.maxlat = 0; /* only the maximum restarts; count and totlat are left accumulating */ + m = p->last.maxlat; /* the copied maximum is what CALC_LOAD folds into avg[0..2] */ + CALC_LOAD(p->avg[0], EXP_1, m) + CALC_LOAD(p->avg[1], EXP_5, m) + CALC_LOAD(p->avg[2], EXP_15, m) +} + +static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p) +{ + unsigned i, cpu; + struct kstat_lat_pcpu_snap_struct snap, *cur; + cycles_t m; + + memset(&p->last, 0, sizeof(p->last)); /* last is rebuilt below from the per-cpu snapshots */ + for_each_online_cpu(cpu) { + cur = per_cpu_ptr(p->cur, cpu); + do { + i = read_seqcount_begin(&cur->lock); + memcpy(&snap, cur, sizeof(snap)); + } while (read_seqcount_retry(&cur->lock, i)); + /* + * read above and this update of maxlat is not atomic, + * but this is OK, since it happens rarely and losing + * a couple of peaks is not essential. 
xemul + */ + cur->maxlat = 0; + + p->last.count += snap.count; + p->last.totlat += snap.totlat; + if (p->last.maxlat < snap.maxlat) + p->last.maxlat = snap.maxlat; + } + + m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap); + CALC_LOAD(p->avg[0], EXP_1, m); + CALC_LOAD(p->avg[1], EXP_5, m); + CALC_LOAD(p->avg[2], EXP_15, m); + /* reset max_snap to calculate it correctly next time */ + p->max_snap = 0; +} + +#endif /* __VZSTAT_H__ */ diff -urNp linux-2.6.32.48/include/linux/xattr.h linux-2.6.32.48-openvz/include/linux/xattr.h --- linux-2.6.32.48/include/linux/xattr.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/linux/xattr.h 2011-11-21 17:40:47.000000000 -0500 @@ -10,6 +10,13 @@ #ifndef _LINUX_XATTR_H #define _LINUX_XATTR_H +#ifdef CONFIG_VE +extern int ve_xattr_policy; +#define VE_XATTR_POLICY_ACCEPT 0 +#define VE_XATTR_POLICY_IGNORE 1 +#define VE_XATTR_POLICY_REJECT 2 +#endif + #define XATTR_CREATE 0x1 /* set value, fail if attr already exists */ #define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */ diff -urNp linux-2.6.32.48/include/net/addrconf.h linux-2.6.32.48-openvz/include/net/addrconf.h --- linux-2.6.32.48/include/net/addrconf.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/addrconf.h 2011-11-21 17:40:47.000000000 -0500 @@ -262,5 +262,9 @@ extern int if6_proc_init(void); extern void if6_proc_exit(void); #endif +int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, + unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, + __u32 valid_lft); + #endif #endif diff -urNp linux-2.6.32.48/include/net/af_unix.h linux-2.6.32.48-openvz/include/net/af_unix.h --- linux-2.6.32.48/include/net/af_unix.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/af_unix.h 2011-11-21 17:40:47.000000000 -0500 @@ -11,6 +11,8 @@ extern void unix_notinflight(struct file extern void unix_gc(void); extern void wait_for_unix_gc(void); extern struct sock 
*unix_get_socket(struct file *filp); +extern void unix_destruct_fds(struct sk_buff *skb); + #define UNIX_HASH_SIZE 256 diff -urNp linux-2.6.32.48/include/net/flow.h linux-2.6.32.48-openvz/include/net/flow.h --- linux-2.6.32.48/include/net/flow.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/flow.h 2011-11-21 17:40:47.000000000 -0500 @@ -10,6 +10,7 @@ #include #include +struct ve_struct; struct flowi { int oif; int iif; @@ -77,6 +78,9 @@ struct flowi { #define fl_icmp_code uli_u.icmpt.code #define fl_ipsec_spi uli_u.spi #define fl_mh_type uli_u.mht.type +#ifdef CONFIG_VE + struct ve_struct *owner_env; +#endif __u32 secid; /* used by xfrm; see secid.txt */ } __attribute__((__aligned__(BITS_PER_LONG/8))); diff -urNp linux-2.6.32.48/include/net/inet_frag.h linux-2.6.32.48-openvz/include/net/inet_frag.h --- linux-2.6.32.48/include/net/inet_frag.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/inet_frag.h 2011-11-21 17:40:47.000000000 -0500 @@ -15,6 +15,9 @@ struct netns_frags { struct inet_frag_queue { struct hlist_node list; struct netns_frags *net; +#ifdef CONFIG_VE + struct ve_struct *owner_ve; +#endif struct list_head lru_list; /* lru list member */ spinlock_t lock; atomic_t refcnt; diff -urNp linux-2.6.32.48/include/net/inet_timewait_sock.h linux-2.6.32.48-openvz/include/net/inet_timewait_sock.h --- linux-2.6.32.48/include/net/inet_timewait_sock.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/inet_timewait_sock.h 2011-11-21 17:40:47.000000000 -0500 @@ -82,6 +82,7 @@ struct inet_timewait_death_row { struct inet_hashinfo *hashinfo; int sysctl_tw_recycle; int sysctl_max_tw_buckets; + int ub_managed; }; extern void inet_twdr_hangman(unsigned long data); @@ -138,6 +139,7 @@ struct inet_timewait_sock { unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; struct hlist_node tw_death_node; + envid_t tw_owner_env; }; static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, 
diff -urNp linux-2.6.32.48/include/net/ip6_fib.h linux-2.6.32.48-openvz/include/net/ip6_fib.h --- linux-2.6.32.48/include/net/ip6_fib.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/ip6_fib.h 2011-11-21 17:40:47.000000000 -0500 @@ -162,6 +162,7 @@ struct fib6_table { u32 tb6_id; rwlock_t tb6_lock; struct fib6_node tb6_root; + struct ve_struct *owner_env; }; #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC diff -urNp linux-2.6.32.48/include/net/netfilter/ipv6/nf_conntrack_ipv6.h linux-2.6.32.48-openvz/include/net/netfilter/ipv6/nf_conntrack_ipv6.h --- linux-2.6.32.48/include/net/netfilter/ipv6/nf_conntrack_ipv6.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/netfilter/ipv6/nf_conntrack_ipv6.h 2011-11-21 17:40:47.000000000 -0500 @@ -9,7 +9,7 @@ extern struct nf_conntrack_l4proto nf_co extern int nf_ct_frag6_init(void); extern void nf_ct_frag6_cleanup(void); -extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user); +extern struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user); extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, struct net_device *in, struct net_device *out, diff -urNp linux-2.6.32.48/include/net/netfilter/nf_conntrack_expect.h linux-2.6.32.48-openvz/include/net/netfilter/nf_conntrack_expect.h --- linux-2.6.32.48/include/net/netfilter/nf_conntrack_expect.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/netfilter/nf_conntrack_expect.h 2011-11-21 17:40:47.000000000 -0500 @@ -81,6 +81,8 @@ void nf_conntrack_expect_fini(struct net struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_tuple *tuple); +void nf_ct_expect_insert(struct nf_conntrack_expect *exp); + struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_tuple *tuple); diff -urNp linux-2.6.32.48/include/net/netfilter/nf_conntrack.h 
linux-2.6.32.48-openvz/include/net/netfilter/nf_conntrack.h --- linux-2.6.32.48/include/net/netfilter/nf_conntrack.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/netfilter/nf_conntrack.h 2011-11-21 17:40:47.000000000 -0500 @@ -270,6 +270,7 @@ extern struct nf_conn * nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, + struct user_beancounter *, gfp_t gfp); /* It's confirmed if it is, or has been in the hash table. */ @@ -291,6 +292,7 @@ static inline int nf_ct_is_untracked(con extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); extern unsigned int nf_conntrack_htable_size; extern unsigned int nf_conntrack_max; +extern int ip_conntrack_disable_ve0 /* XXX: unused */; #define NF_CT_STAT_INC(net, count) \ (per_cpu_ptr((net)->ct.stat, raw_smp_processor_id())->count++) diff -urNp linux-2.6.32.48/include/net/netfilter/nf_nat.h linux-2.6.32.48-openvz/include/net/netfilter/nf_nat.h --- linux-2.6.32.48/include/net/netfilter/nf_nat.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/netfilter/nf_nat.h 2011-11-21 17:40:47.000000000 -0500 @@ -78,6 +78,8 @@ struct nf_conn_nat #endif }; +void nf_nat_hash_conntrack(struct net *net, struct nf_conn *ct); + /* Set up the info structure to map into this range. 
*/ extern unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, diff -urNp linux-2.6.32.48/include/net/netlink_sock.h linux-2.6.32.48-openvz/include/net/netlink_sock.h --- linux-2.6.32.48/include/net/netlink_sock.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/netlink_sock.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,23 @@ +#ifndef __NET_NETLINK_SOCK_H +#define __NET_NETLINK_SOCK_H + +struct netlink_sock { + /* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 pid; + u32 dst_pid; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; + unsigned long state; + wait_queue_head_t wait; + struct netlink_callback *cb; + struct mutex *cb_mutex; + struct mutex cb_def_mutex; + void (*netlink_rcv)(struct sk_buff *skb); + struct module *module; +}; + +#endif /* __NET_NETLINK_SOCK_H */ diff -urNp linux-2.6.32.48/include/net/net_namespace.h linux-2.6.32.48-openvz/include/net/net_namespace.h --- linux-2.6.32.48/include/net/net_namespace.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/net_namespace.h 2011-11-21 17:40:47.000000000 -0500 @@ -53,6 +53,13 @@ struct net { struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + int ifindex; + +#ifdef CONFIG_VE + struct completion *sysfs_completion; + struct ve_struct *owner_ve; +#endif + /* core fib_rules */ struct list_head rules_ops; spinlock_t rules_mod_lock; diff -urNp linux-2.6.32.48/include/net/netns/ipv6.h linux-2.6.32.48-openvz/include/net/netns/ipv6.h --- linux-2.6.32.48/include/net/netns/ipv6.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/netns/ipv6.h 2011-11-21 17:40:47.000000000 -0500 @@ -14,6 +14,7 @@ struct netns_sysctl_ipv6 { #ifdef CONFIG_SYSCTL struct ctl_table_header *table; struct ctl_table_header *frags_hdr; + struct ctl_table_header *nf_frags_hdr; #endif int bindv6only; int flush_delay; @@ -32,6 +33,7 @@ struct 
netns_ipv6 { struct ipv6_devconf *devconf_all; struct ipv6_devconf *devconf_dflt; struct netns_frags frags; + struct netns_frags ct_frags; #ifdef CONFIG_NETFILTER struct xt_table *ip6table_filter; struct xt_table *ip6table_mangle; diff -urNp linux-2.6.32.48/include/net/route.h linux-2.6.32.48-openvz/include/net/route.h --- linux-2.6.32.48/include/net/route.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/route.h 2011-11-21 17:40:47.000000000 -0500 @@ -138,6 +138,7 @@ static inline void ip_rt_put(struct rtab #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) extern const __u8 ip_tos2prio[16]; +extern int ip_rt_src_check; static inline char rt_tos2priority(u8 tos) { diff -urNp linux-2.6.32.48/include/net/sock.h linux-2.6.32.48-openvz/include/net/sock.h --- linux-2.6.32.48/include/net/sock.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/sock.h 2011-11-21 17:40:47.000000000 -0500 @@ -60,6 +60,8 @@ #include #include +#include + /* * This structure really needs to be cleaned up. 
* Most of it is for TCP, and not used by any of @@ -301,6 +303,8 @@ struct sock { int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); + struct sock_beancounter sk_bc; + struct ve_struct *owner_env; }; /* @@ -591,6 +595,8 @@ static inline int sk_backlog_rcv(struct }) extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); +extern int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, + unsigned long amount); extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); extern void sk_stream_wait_close(struct sock *sk, long timeo_p); extern int sk_stream_error(struct sock *sk, int flags, int err); @@ -828,7 +834,8 @@ static inline int sk_has_account(struct return !!sk->sk_prot->memory_allocated; } -static inline int sk_wmem_schedule(struct sock *sk, int size) +static inline int sk_wmem_schedule(struct sock *sk, int size, + struct sk_buff *skb) { if (!sk_has_account(sk)) return 1; @@ -836,12 +843,15 @@ static inline int sk_wmem_schedule(struc __sk_mem_schedule(sk, size, SK_MEM_SEND); } -static inline int sk_rmem_schedule(struct sock *sk, int size) +static inline int sk_rmem_schedule(struct sock *sk, struct sk_buff *skb) { if (!sk_has_account(sk)) return 1; - return size <= sk->sk_forward_alloc || - __sk_mem_schedule(sk, size, SK_MEM_RECV); + if (!(skb->truesize <= sk->sk_forward_alloc || + __sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))) + return 0; + + return !ub_sockrcvbuf_charge(sk, skb); } static inline void sk_mem_reclaim(struct sock *sk) @@ -965,6 +975,11 @@ extern struct sk_buff *sock_alloc_send unsigned long data_len, int noblock, int *errcode); +extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk, + unsigned long size, + unsigned long size2, + int noblock, + int *errcode); extern void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); extern void sock_kfree_s(struct sock *sk, void *mem, int size); @@ -1327,6 +1342,7 @@ static inline void sock_poll_wait(struct static inline 
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { + WARN_ON(skb->destructor); skb_orphan(skb); skb->sk = sk; skb->destructor = sock_wfree; @@ -1340,6 +1356,7 @@ static inline void skb_set_owner_w(struc static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { + WARN_ON(skb->destructor); skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rfree; @@ -1562,6 +1579,13 @@ static inline void sk_change_net(struct sock_net_set(sk, hold_net(net)); } +static inline void sk_change_net_get(struct sock *sk, struct net *net) +{ + struct net *old_net = sock_net(sk); + sock_net_set(sk, get_net(net)); + put_net(old_net); +} + static inline struct sock *skb_steal_sock(struct sk_buff *skb) { if (unlikely(skb->sk)) { diff -urNp linux-2.6.32.48/include/net/tcp.h linux-2.6.32.48-openvz/include/net/tcp.h --- linux-2.6.32.48/include/net/tcp.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/include/net/tcp.h 2011-11-21 17:40:47.000000000 -0500 @@ -44,6 +44,13 @@ #include #include +#include + +#define TCP_PAGE(sk) (sk->sk_sndmsg_page) +#define TCP_OFF(sk) (sk->sk_sndmsg_off) + +#define TW_WSCALE_MASK 0x0f +#define TW_WSCALE_SPEC 0x10 extern struct inet_hashinfo tcp_hashinfo; @@ -222,7 +229,9 @@ extern int sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_app_win; +#ifndef sysctl_tcp_adv_win_scale extern int sysctl_tcp_adv_win_scale; +#endif extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_frto_response; @@ -237,6 +246,10 @@ extern int sysctl_tcp_base_mss; extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_max_ssthresh; +extern int sysctl_tcp_use_sg; +extern int sysctl_tcp_max_tw_kmem_fraction; +extern int sysctl_tcp_max_tw_buckets_ub; + extern atomic_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -259,9 +272,9 @@ static inline int between(__u32 seq1, __ return seq3 - 
seq2 >= seq1 - seq2; } -static inline bool tcp_too_many_orphans(struct sock *sk, int shift) +static inline bool tcp_too_many_orphans(struct sock *sk, + struct percpu_counter *ocp, int shift) { - struct percpu_counter *ocp = sk->sk_prot->orphan_count; int orphans = percpu_counter_read_positive(ocp); if (orphans << shift > sysctl_tcp_max_orphans) { @@ -616,7 +629,11 @@ extern u32 __tcp_select_window(struct so * to use only the low 32-bits of jiffies and hide the ugly * casts with the following macro. */ +#ifdef CONFIG_VE +#define tcp_time_stamp ((__u32)(jiffies + get_exec_env()->jiffies_fixup)) +#else #define tcp_time_stamp ((__u32)(jiffies)) +#endif /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission diff -urNp linux-2.6.32.48/init/calibrate.c linux-2.6.32.48-openvz/init/calibrate.c --- linux-2.6.32.48/init/calibrate.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/init/calibrate.c 2011-11-21 17:40:47.000000000 -0500 @@ -9,6 +9,7 @@ #include #include #include +#include unsigned long lpj_fine; unsigned long preset_lpj; @@ -108,6 +109,60 @@ static unsigned long __cpuinit calibrate static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} #endif +unsigned long cycles_per_jiffy, cycles_per_clock; + +static __devinit void calibrate_cycles(void) +{ + unsigned long ticks; + cycles_t time; + + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + time = get_cycles(); + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + + time = get_cycles() - time; + cycles_per_jiffy = time; + if ((time >> 32) != 0) { + printk("CPU too fast! 
timings are incorrect\n"); + cycles_per_jiffy = -1; + } +} + +EXPORT_SYMBOL(cycles_per_jiffy); +EXPORT_SYMBOL(cycles_per_clock); + +static __devinit void calc_cycles_per_jiffy(void) /* fills in cycles_per_jiffy and cycles_per_clock */ +{ +#if 0 /* disabled: alternative derivation from fast_gettimeoffset_quotient */ + extern unsigned long fast_gettimeoffset_quotient; + unsigned long low, high; + + if (fast_gettimeoffset_quotient != 0) { + __asm__("divl %2" + :"=a" (low), "=d" (high) + :"r" (fast_gettimeoffset_quotient), + "0" (0), "1" (1000000/HZ)); + + cycles_per_jiffy = low; + } +#endif + if (cycles_per_jiffy == 0) /* still at its initial zero: measure now */ + calibrate_cycles(); + + if (cycles_per_jiffy == 0) { /* calibration saw no cycle-counter advance over one jiffy */ + printk(KERN_WARNING "Cycles are stuck! " + "Some statistics will not be available.\n"); + /* to prevent division by zero in cycles_to_(clocks|jiffies) */ + cycles_per_jiffy = 1; + cycles_per_clock = 1; + } else /* NOTE(review): integer HZ / CLOCKS_PER_SEC truncates when HZ is not a multiple of it */ + cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC); +} + /* * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds. This (like the original) is a little @@ -178,5 +233,6 @@ void __cpuinit calibrate_delay(void) loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); + calc_cycles_per_jiffy(); printed = true; } diff -urNp linux-2.6.32.48/init/Kconfig linux-2.6.32.48-openvz/init/Kconfig --- linux-2.6.32.48/init/Kconfig 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/init/Kconfig 2011-11-21 17:40:47.000000000 -0500 @@ -279,7 +279,7 @@ config TASK_XACCT config TASK_IO_ACCOUNTING bool "Enable per-task storage I/O accounting (EXPERIMENTAL)" - depends on TASK_XACCT + depends on TASK_XACCT && BEANCOUNTERS help Collect information on the number of bytes of storage I/O which this task has caused. 
@@ -454,7 +454,7 @@ config CGROUP_DEBUG config CGROUP_NS bool "Namespace cgroup subsystem" - depends on CGROUPS + depends on CGROUPS && !VE help Provides a simple namespace cgroup subsystem to provide hierarchical naming of sets of namespaces, @@ -463,7 +463,7 @@ config CGROUP_NS config CGROUP_FREEZER bool "Freezer cgroup subsystem" - depends on CGROUPS + depends on CGROUPS && !VE help Provides a way to freeze and unfreeze all tasks in a cgroup. @@ -477,7 +477,7 @@ config CGROUP_DEVICE config CPUSETS bool "Cpuset support" - depends on CGROUPS + depends on CGROUPS && !VE help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and @@ -493,13 +493,14 @@ config PROC_PID_CPUSET config CGROUP_CPUACCT bool "Simple CPU accounting cgroup subsystem" - depends on CGROUPS + depends on CGROUPS && !VE help Provides a simple Resource Controller for monitoring the total CPU consumed by the tasks in a cgroup. config RESOURCE_COUNTERS bool "Resource counters" + depends on !BEANCOUNTERS help This option enables controller independent resource accounting infrastructure that works with cgroups. @@ -556,16 +557,22 @@ menuconfig CGROUP_SCHED tasks. if CGROUP_SCHED + +config VZ_FAIRSCHED + bool "OpenVZ groups" + depends on CGROUP_SCHED + default CGROUP_SCHED + help + This option add customizable task groups with OpenVZ compatible + syscall and procfs interface. + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED - default CGROUP_SCHED config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" - depends on EXPERIMENTAL depends on CGROUP_SCHED - default n help This feature lets you explicitly allocate real CPU bandwidth to task groups. If enabled, it will also make it impossible to @@ -616,6 +623,16 @@ config SYSFS_DEPRECATED_V2 if the original kernel, that came with your distribution, has this option set to N. 
+config SYSFS_DEPRECATED_DYN + bool "make deprecated sysfs layout dynamically" + depends on SYSFS + default y + select SYSFS_DEPRECATED_V2 + help + This option works like the DEPRECATED_V2 but allows selecting the + sysfs layout dynamically, i.e. on boot. To select the old + (deprecated) layout, supply old_sysfs kernel boot parameter. + config RELAY bool "Kernel->user space relay support (formerly relayfs)" help diff -urNp linux-2.6.32.48/init/main.c linux-2.6.32.48-openvz/init/main.c --- linux-2.6.32.48/init/main.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/init/main.c 2011-11-21 17:40:47.000000000 -0500 @@ -70,6 +70,9 @@ #include #include #include +#include + +#include #include #include @@ -101,6 +104,16 @@ extern void tc_init(void); enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state); +#ifdef CONFIG_VE +extern void init_ve_system(void); +extern void init_ve0(void); +extern void prepare_ve0_process(struct task_struct *tsk); +#else +#define init_ve_system() do { } while (0) +#define init_ve0() do { } while (0) +#define prepare_ve0_process(tsk) do { } while (0) +#endif + /* * Boot command-line arguments */ @@ -524,6 +537,8 @@ asmlinkage void __init start_kernel(void smp_setup_processor_id(); + prepare_ve0_process(&init_task); + /* * Need to run as early as possible, to initialize the * lockdep hash: @@ -556,6 +571,8 @@ asmlinkage void __init start_kernel(void setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); + init_ve0(); + ub_init_early(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(); @@ -663,6 +680,7 @@ asmlinkage void __init start_kernel(void cred_init(); fork_init(totalram_pages); proc_caches_init(); + ub_init_late(); buffer_init(); key_init(); security_init(); @@ -686,6 +704,10 @@ asmlinkage void __init start_kernel(void ftrace_init(); +#ifdef CONFIG_BC_RSS_ACCOUNTING + ub_init_pbc(); +#endif + /* Do the rest non-__init'ed, we're now alive */ rest_init(); 
} @@ -776,6 +798,7 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { + init_ve_system(); init_workqueues(); cpuset_init_smp(); usermodehelper_init(); @@ -881,6 +904,7 @@ static int __init kernel_init(void * unu start_boot_trace(); smp_init(); + fairsched_init_late(); sched_init_smp(); do_basic_setup(); diff -urNp linux-2.6.32.48/init/version.c linux-2.6.32.48-openvz/init/version.c --- linux-2.6.32.48/init/version.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/init/version.c 2011-11-21 17:40:47.000000000 -0500 @@ -36,6 +36,12 @@ struct uts_namespace init_uts_ns = { }; EXPORT_SYMBOL_GPL(init_uts_ns); +struct new_utsname virt_utsname = { + /* we need only this field */ + .release = UTS_RELEASE, +}; +EXPORT_SYMBOL(virt_utsname); + /* FIXED STRINGS! Don't touch! */ const char linux_banner[] = "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" diff -urNp linux-2.6.32.48/ipc/ipc_sysctl.c linux-2.6.32.48-openvz/ipc/ipc_sysctl.c --- linux-2.6.32.48/ipc/ipc_sysctl.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/ipc/ipc_sysctl.c 2011-11-21 17:40:47.000000000 -0500 @@ -270,19 +270,14 @@ static struct ctl_table ipc_kern_table[] {} }; -static struct ctl_table ipc_root_table[] = { - { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0555, - .child = ipc_kern_table, - }, +static struct ctl_path ipc_path[] = { + { .ctl_name = CTL_KERN, .procname = "kernel", }, {} }; static int __init ipc_sysctl_init(void) { - register_sysctl_table(ipc_root_table); + register_sysctl_glob_paths(ipc_path, ipc_kern_table, 1); return 0; } diff -urNp linux-2.6.32.48/ipc/msg.c linux-2.6.32.48-openvz/ipc/msg.c --- linux-2.6.32.48/ipc/msg.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/ipc/msg.c 2011-11-21 17:40:47.000000000 -0500 @@ -184,6 +184,7 @@ static int newque(struct ipc_namespace * int id, retval; key_t key = params->key; int msgflg = params->flg; + int msqid = params->id; msq = 
ipc_rcu_alloc(sizeof(*msq)); if (!msq) @@ -202,7 +203,7 @@ static int newque(struct ipc_namespace * /* * ipc_addid() locks msq */ - id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); + id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, msqid); if (id < 0) { security_msg_queue_free(msq); ipc_rcu_putref(msq); @@ -324,6 +325,7 @@ SYSCALL_DEFINE2(msgget, key_t, key, int, msg_params.key = key; msg_params.flg = msgflg; + msg_params.id = -1; return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); } @@ -943,3 +945,55 @@ static int sysvipc_msg_proc_show(struct msq->q_ctime); } #endif + +#ifdef CONFIG_VE +#include + +int sysvipc_setup_msg(key_t key, int msqid, int msgflg) +{ + struct ipc_namespace *ns; + struct ipc_ops msg_ops; + struct ipc_params msg_params; + + ns = current->nsproxy->ipc_ns; + + msg_ops.getnew = newque; + msg_ops.associate = msg_security; + msg_ops.more_checks = NULL; + + msg_params.key = key; + msg_params.flg = msgflg | IPC_CREAT; + msg_params.id = msqid; + + return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); +} +EXPORT_SYMBOL_GPL(sysvipc_setup_msg); + +int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg) +{ + int err = 0; + struct msg_queue * msq; + struct ipc_namespace *ns; + int next_id; + int total, in_use; + + ns = current->nsproxy->ipc_ns; + + down_write(&msg_ids(ns).rw_mutex); + in_use = msg_ids(ns).in_use; + for (total = 0, next_id = 0; total < in_use; next_id++) { + msq = idr_find(&msg_ids(ns).ipcs_idr, next_id); + if (msq == NULL) + continue; + ipc_lock_by_ptr(&msq->q_perm); + err = func(ipc_buildid(next_id, msq->q_perm.seq), msq, arg); + msg_unlock(msq); + if (err) + break; + total++; + } + up_write(&msg_ids(ns).rw_mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_msg); +#endif diff -urNp linux-2.6.32.48/ipc/msgutil.c linux-2.6.32.48-openvz/ipc/msgutil.c --- linux-2.6.32.48/ipc/msgutil.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/ipc/msgutil.c 2011-11-21 
17:40:47.000000000 -0500 @@ -8,6 +8,7 @@ * See the file COPYING for more details. */ +#include #include #include #include @@ -18,6 +19,8 @@ #include "util.h" +#include + DEFINE_SPINLOCK(mq_lock); /* @@ -44,52 +47,53 @@ struct msg_msgseg { #define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) #define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) -struct msg_msg *load_msg(const void __user *src, int len) +struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset, + void * data), int len, void * data) { struct msg_msg *msg; struct msg_msgseg **pseg; int err; int alen; + int offset = 0; alen = len; if (alen > DATALEN_MSG) alen = DATALEN_MSG; - msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_UBC); if (msg == NULL) return ERR_PTR(-ENOMEM); msg->next = NULL; msg->security = NULL; - if (copy_from_user(msg + 1, src, alen)) { + if (load(msg + 1, alen, offset, data)) { err = -EFAULT; goto out_err; } len -= alen; - src = ((char __user *)src) + alen; + offset += alen; pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; alen = len; if (alen > DATALEN_SEG) alen = DATALEN_SEG; - seg = kmalloc(sizeof(*seg) + alen, - GFP_KERNEL); + seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_UBC); if (seg == NULL) { err = -ENOMEM; goto out_err; } *pseg = seg; seg->next = NULL; - if (copy_from_user(seg + 1, src, alen)) { + if (load(seg + 1, alen, offset, data)) { err = -EFAULT; goto out_err; } pseg = &seg->next; len -= alen; - src = ((char __user *)src) + alen; + offset += alen; } err = security_msg_msg_alloc(msg); @@ -102,33 +106,58 @@ out_err: free_msg(msg); return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(sysv_msg_load); -int store_msg(void __user *dest, struct msg_msg *msg, int len) +static int do_load_msg(void * dst, int len, int offset, void * data) +{ + return copy_from_user(dst, data + offset, len); +} + +struct msg_msg *load_msg(const void __user *src, int len) +{ + return sysv_msg_load(do_load_msg, len, (void*)src); +} 
+ +int sysv_msg_store(struct msg_msg *msg, + int (*store)(void * src, int len, int offset, void * data), + int len, void * data) { int alen; + int offset = 0; struct msg_msgseg *seg; - + alen = len; if (alen > DATALEN_MSG) alen = DATALEN_MSG; - if (copy_to_user(dest, msg + 1, alen)) + if (store(msg + 1, alen, offset, data)) return -1; len -= alen; - dest = ((char __user *)dest) + alen; + offset += alen; seg = msg->next; while (len > 0) { alen = len; if (alen > DATALEN_SEG) alen = DATALEN_SEG; - if (copy_to_user(dest, seg + 1, alen)) + if (store(seg + 1, alen, offset, data)) return -1; len -= alen; - dest = ((char __user *)dest) + alen; + offset += alen; seg = seg->next; } return 0; } +EXPORT_SYMBOL_GPL(sysv_msg_store); + +static int do_store_msg(void * src, int len, int offset, void * data) +{ + return copy_to_user(data + offset, src, len); +} + +int store_msg(void __user *dest, struct msg_msg *msg, int len) +{ + return sysv_msg_store(msg, do_store_msg, len, dest); +} void free_msg(struct msg_msg *msg) { diff -urNp linux-2.6.32.48/ipc/sem.c linux-2.6.32.48-openvz/ipc/sem.c --- linux-2.6.32.48/ipc/sem.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/ipc/sem.c 2011-11-21 17:40:47.000000000 -0500 @@ -87,6 +87,8 @@ #include #include "util.h" +#include + #define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS]) #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) @@ -241,6 +243,7 @@ static int newary(struct ipc_namespace * key_t key = params->key; int nsems = params->u.nsems; int semflg = params->flg; + int semid = params->id; if (!nsems) return -EINVAL; @@ -264,7 +267,7 @@ static int newary(struct ipc_namespace * return retval; } - id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); + id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, semid); if (id < 0) { security_sem_free(sma); ipc_rcu_putref(sma); @@ -327,6 +330,7 @@ SYSCALL_DEFINE3(semget, key_t, key, int, sem_params.key = key; sem_params.flg = semflg; sem_params.u.nsems = nsems; + 
sem_params.id = -1; return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } @@ -951,7 +955,7 @@ static inline int get_undo_list(struct s undo_list = current->sysvsem.undo_list; if (!undo_list) { - undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); + undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_UBC); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); @@ -1016,7 +1020,8 @@ static struct sem_undo *find_alloc_undo( sem_getref_and_unlock(sma); /* step 2: allocate new undo structure */ - new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); + new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, + GFP_KERNEL_UBC); if (!new) { sem_putref(sma); return ERR_PTR(-ENOMEM); @@ -1078,7 +1083,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, if (nsops > ns->sc_semopm) return -E2BIG; if(nsops > SEMOPM_FAST) { - sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); + sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL_UBC); if(sops==NULL) return -ENOMEM; } @@ -1381,3 +1386,57 @@ static int sysvipc_sem_proc_show(struct sma->sem_ctime); } #endif + +#ifdef CONFIG_VE +#include + +int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg) +{ + struct ipc_namespace *ns; + struct ipc_ops sem_ops; + struct ipc_params sem_params; + + ns = current->nsproxy->ipc_ns; + + sem_ops.getnew = newary; + sem_ops.associate = sem_security; + sem_ops.more_checks = sem_more_checks; + + sem_params.key = key; + sem_params.flg = semflg | IPC_CREAT; + sem_params.u.nsems = size; + sem_params.id = semid; + + return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); +} +EXPORT_SYMBOL_GPL(sysvipc_setup_sem); + +int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg) +{ + int err = 0; + struct sem_array *sma; + struct ipc_namespace *ns; + int next_id; + int total, in_use; + + ns = current->nsproxy->ipc_ns; + + down_write(&sem_ids(ns).rw_mutex); + in_use = sem_ids(ns).in_use; + for (total = 0, next_id = 0; total < in_use; next_id++) { + sma 
= idr_find(&sem_ids(ns).ipcs_idr, next_id); + if (sma == NULL) + continue; + ipc_lock_by_ptr(&sma->sem_perm); + err = func(ipc_buildid(next_id, sma->sem_perm.seq), sma, arg); + sem_unlock(sma); + if (err) + break; + total++; + } + up_write(&sem_ids(ns).rw_mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_sem); +EXPORT_SYMBOL_GPL(exit_sem); +#endif diff -urNp linux-2.6.32.48/ipc/shm.c linux-2.6.32.48-openvz/ipc/shm.c --- linux-2.6.32.48/ipc/shm.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/ipc/shm.c 2011-11-21 17:40:47.000000000 -0500 @@ -40,27 +40,17 @@ #include #include #include +#include #include -#include "util.h" - -struct shm_file_data { - int id; - struct ipc_namespace *ns; - struct file *file; - const struct vm_operations_struct *vm_ops; -}; +#include +#include -#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) +#include "util.h" -static const struct file_operations shm_file_operations; static const struct vm_operations_struct shm_vm_ops; -#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) - -#define shm_unlock(shp) \ - ipc_unlock(&(shp)->shm_perm) static int newseg(struct ipc_namespace *, struct ipc_params *); static void shm_open(struct vm_area_struct *vma); @@ -113,20 +103,6 @@ void __init shm_init (void) IPC_SHM_IDS, sysvipc_shm_proc_show); } -/* - * shm_lock_(check_) routines are called in the paths where the rw_mutex - * is not necessarily held. 
- */ -static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; - - return container_of(ipcp, struct shmid_kernel, shm_perm); -} - static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, int id) { @@ -295,7 +271,7 @@ static unsigned long shm_get_unmapped_ar pgoff, flags); } -static const struct file_operations shm_file_operations = { +const struct file_operations shm_file_operations = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, @@ -307,6 +283,7 @@ static const struct file_operations shm_ .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, }; +EXPORT_SYMBOL_GPL(shm_file_operations); int is_file_shm_hugepages(struct file *file) { @@ -336,11 +313,12 @@ static int newseg(struct ipc_namespace * key_t key = params->key; int shmflg = params->flg; size_t size = params->u.size; + int shmid = params->id; int error; struct shmid_kernel *shp; int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; struct file * file; - char name[13]; + char name[64]; int id; int acctflag = 0; @@ -365,7 +343,7 @@ static int newseg(struct ipc_namespace * return error; } - sprintf (name, "SYSV%08x", key); + snprintf (name, sizeof(name), "VE%d-SYSV%08x", VEID(get_exec_env()), key); if (shmflg & SHM_HUGETLB) { /* hugetlb_file_setup applies strict accounting */ if (shmflg & SHM_NORESERVE) @@ -386,7 +364,7 @@ static int newseg(struct ipc_namespace * if (IS_ERR(file)) goto no_file; - id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); + id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, shmid); if (id < 0) { error = id; goto no_id; @@ -461,6 +439,7 @@ SYSCALL_DEFINE3(shmget, key_t, key, size shm_params.key = key; shm_params.flg = shmflg; shm_params.u.size = size; + shm_params.id = -1; return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); } @@ -1100,3 +1079,67 @@ static int 
sysvipc_shm_proc_show(struct shp->shm_ctim); } #endif + +#ifdef CONFIG_VE +#include + +struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg) +{ + struct ipc_namespace *ns; + struct ipc_ops shm_ops; + struct ipc_params shm_params; + struct shmid_kernel *shp; + struct file *file; + int rv; + + ns = current->nsproxy->ipc_ns; + + shm_ops.getnew = newseg; + shm_ops.associate = shm_security; + shm_ops.more_checks = shm_more_checks; + + shm_params.key = key; + shm_params.flg = shmflg | IPC_CREAT; + shm_params.u.size = size; + shm_params.id = shmid; + + rv = ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); + if (rv < 0) + return ERR_PTR(rv); + shp = shm_lock(ns, rv); + BUG_ON(IS_ERR(shp)); + file = shp->shm_file; + get_file(file); + shm_unlock(shp); + return file; +} +EXPORT_SYMBOL_GPL(sysvipc_setup_shm); + +int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg) +{ + int err = 0; + struct shmid_kernel* shp; + struct ipc_namespace *ns; + int next_id; + int total, in_use; + + ns = current->nsproxy->ipc_ns; + + down_write(&shm_ids(ns).rw_mutex); + in_use = shm_ids(ns).in_use; + for (total = 0, next_id = 0; total < in_use; next_id++) { + shp = idr_find(&shm_ids(ns).ipcs_idr, next_id); + if (shp == NULL) + continue; + ipc_lock_by_ptr(&shp->shm_perm); + err = func(shp, arg); + shm_unlock(shp); + if (err) + break; + total++; + } + up_write(&shm_ids(ns).rw_mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_shm); +#endif diff -urNp linux-2.6.32.48/ipc/util.c linux-2.6.32.48-openvz/ipc/util.c --- linux-2.6.32.48/ipc/util.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/ipc/util.c 2011-11-21 17:40:47.000000000 -0500 @@ -38,6 +38,8 @@ #include +#include + #include "util.h" struct ipc_proc_iface { @@ -238,6 +240,7 @@ int ipc_get_maxid(struct ipc_ids *ids) * @ids: IPC identifier set * @new: new IPC permission set * @size: limit for the number of used ids + * @reqid: if >= 0, get this id exactly. If -1 -- don't care. 
* * Add an entry 'new' to the IPC ids idr. The permissions object is * initialised and the first free entry is set up and the id assigned @@ -247,7 +250,7 @@ int ipc_get_maxid(struct ipc_ids *ids) * Called with ipc_ids.rw_mutex held as a writer. */ -int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) +int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid) { uid_t euid; gid_t egid; @@ -264,7 +267,16 @@ int ipc_addid(struct ipc_ids* ids, struc rcu_read_lock(); spin_lock(&new->lock); - err = idr_get_new(&ids->ipcs_idr, new, &id); + if (reqid >= 0) { + id = reqid % SEQ_MULTIPLIER; + err = idr_get_new_above(&ids->ipcs_idr, new, id, &id); + if (!err && id != (reqid % SEQ_MULTIPLIER)) { + idr_remove(&ids->ipcs_idr, id); + err = -EEXIST; + } + } else + err = idr_get_new(&ids->ipcs_idr, new, &id); + if (err) { spin_unlock(&new->lock); rcu_read_unlock(); @@ -277,9 +289,13 @@ int ipc_addid(struct ipc_ids* ids, struc new->cuid = new->uid = euid; new->gid = new->cgid = egid; - new->seq = ids->seq++; - if(ids->seq > ids->seq_max) - ids->seq = 0; + if (reqid >= 0) { + new->seq = reqid/SEQ_MULTIPLIER; + } else { + new->seq = ids->seq++; + if(ids->seq > ids->seq_max) + ids->seq = 0; + } new->id = ipc_buildid(id, new->seq); return id; @@ -443,9 +459,9 @@ void* ipc_alloc(int size) { void* out; if(size > PAGE_SIZE) - out = vmalloc(size); + out = ub_vmalloc(size); else - out = kmalloc(size, GFP_KERNEL); + out = kmalloc(size, GFP_KERNEL_UBC); return out; } @@ -528,14 +544,14 @@ void* ipc_rcu_alloc(int size) * workqueue if necessary (for vmalloc). 
*/ if (rcu_use_vmalloc(size)) { - out = vmalloc(HDRLEN_VMALLOC + size); + out = ub_vmalloc(HDRLEN_VMALLOC + size); if (out) { out += HDRLEN_VMALLOC; container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; } } else { - out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); + out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL_UBC); if (out) { out += HDRLEN_KMALLOC; container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; @@ -714,6 +730,7 @@ struct kern_ipc_perm *ipc_lock(struct ip return out; } +EXPORT_SYMBOL_GPL(ipc_lock); struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id) { @@ -800,7 +817,7 @@ struct kern_ipc_perm *ipcctl_pre_down(st euid = current_euid(); if (euid == ipcp->cuid || - euid == ipcp->uid || capable(CAP_SYS_ADMIN)) + euid == ipcp->uid || capable(CAP_VE_SYS_ADMIN)) return ipcp; err = -EPERM; diff -urNp linux-2.6.32.48/ipc/util.h linux-2.6.32.48-openvz/ipc/util.h --- linux-2.6.32.48/ipc/util.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/ipc/util.h 2011-11-21 17:40:47.000000000 -0500 @@ -58,6 +58,7 @@ struct ipc_params { size_t size; /* for shared memories */ int nsems; /* for semaphores */ } u; /* holds the getnew() specific param */ + int id; }; /* @@ -87,14 +88,10 @@ void __init ipc_init_proc_interface(cons #define ipc_init_proc_interface(path, header, ids, show) do {} while (0) #endif -#define IPC_SEM_IDS 0 -#define IPC_MSG_IDS 1 -#define IPC_SHM_IDS 2 - #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) /* must be called with ids->rw_mutex acquired for writing */ -int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); +int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int, int); /* must be called with ids->rw_mutex acquired for reading */ int ipc_get_maxid(struct ipc_ids *); @@ -121,7 +118,6 @@ void* ipc_rcu_alloc(int size); void ipc_rcu_getref(void *ptr); void ipc_rcu_putref(void *ptr); -struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); void 
kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); @@ -163,12 +159,6 @@ static inline void ipc_lock_by_ptr(struc spin_lock(&perm->lock); } -static inline void ipc_unlock(struct kern_ipc_perm *perm) -{ - spin_unlock(&perm->lock); - rcu_read_unlock(); -} - struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); diff -urNp linux-2.6.32.48/kernel/audit.c linux-2.6.32.48-openvz/kernel/audit.c --- linux-2.6.32.48/kernel/audit.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/audit.c 2011-11-21 17:40:47.000000000 -0500 @@ -662,6 +662,9 @@ static int audit_receive_msg(struct sk_b char *ctx = NULL; u32 len; + if (!ve_is_super(skb->owner_env)) + return -ECONNREFUSED; + err = audit_netlink_ok(skb, msg_type); if (err) return err; diff -urNp linux-2.6.32.48/kernel/bc/beancounter.c linux-2.6.32.48-openvz/kernel/bc/beancounter.c --- linux-2.6.32.48/kernel/bc/beancounter.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/beancounter.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,715 @@ +/* + * linux/kernel/bc/beancounter.c + * + * Copyright (C) 1998 Alan Cox + * 1998-2000 Andrey V. Savochkin + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * TODO: + * - more intelligent limit check in mremap(): currently the new size is + * charged and _then_ old size is uncharged + * (almost done: !move_vma case is completely done, + * move_vma in its current implementation requires too many conditions to + * do things right, because it may be not only expansion, but shrinking + * also, plus do_munmap will require an additional parameter...) 
+ * - problem: bad pmd page handling + * - consider /proc redesign + * - TCP/UDP ports + * + consider whether __charge_beancounter_locked should be inline + * + * Changes: + * 1999/08/17 Marcelo Tosatti + * - Set "barrier" and "limit" parts of limits atomically. + * 1999/10/06 Marcelo Tosatti + * - setublimit system call. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static struct kmem_cache *ub_cachep; +static struct user_beancounter default_beancounter; +struct user_beancounter ub0; +EXPORT_SYMBOL_GPL(ub0); + +const char *ub_rnames[] = { + "kmemsize", /* 0 */ + "lockedpages", + "privvmpages", + "shmpages", + "dummy", + "numproc", /* 5 */ + "physpages", + "vmguarpages", + "oomguarpages", + "numtcpsock", + "numflock", /* 10 */ + "numpty", + "numsiginfo", + "tcpsndbuf", + "tcprcvbuf", + "othersockbuf", /* 15 */ + "dgramrcvbuf", + "numothersock", + "dcachesize", + "numfile", + "dummy", /* 20 */ + "dummy", + "dummy", + "numiptent", + "swappages", + "unused_privvmpages", /* UB_RESOURCES */ + "tmpfs_respages", + "held_pages", +}; + +static void init_beancounter_struct(struct user_beancounter *ub); +static void init_beancounter_store(struct user_beancounter *ub); +static void init_beancounter_nolimits(struct user_beancounter *ub); + +int print_ub_uid(struct user_beancounter *ub, char *buf, int size) +{ + if (ub->parent != NULL) + return snprintf(buf, size, "%u.%u", + ub->parent->ub_uid, ub->ub_uid); + else + return snprintf(buf, size, "%u", ub->ub_uid); +} +EXPORT_SYMBOL(print_ub_uid); + +#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1)) +#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17) +struct hlist_head ub_hash[UB_HASH_SIZE]; +DEFINE_SPINLOCK(ub_hash_lock); +LIST_HEAD(ub_list_head); /* protected by ub_hash_lock */ +EXPORT_SYMBOL(ub_hash); +EXPORT_SYMBOL(ub_hash_lock); +EXPORT_SYMBOL(ub_list_head); + +/* + * Per user resource beancounting. Resources are tied to their luid. 
+ * The resource structure itself is tagged both to the process and + * the charging resources (a socket doesn't want to have to search for + * things at irq time for example). Reference counters keep things in + * hand. + * + * The case where a user creates resource, kills all his processes and + * then starts new ones is correctly handled this way. The refcounters + * will mean the old entry is still around with resource tied to it. + */ +/* Allocate and set up a beancounter with id @uid and parent @p (may be NULL); returns NULL on allocation failure. */ +static struct user_beancounter *alloc_ub(uid_t uid, struct user_beancounter *p) +{ + struct user_beancounter *new_ub; + + new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, + GFP_KERNEL); + if (new_ub == NULL) + return NULL; + + /* log only after the allocation, so new_ub holds a valid pointer */ + ub_debug(UBD_ALLOC, "Creating ub %p\n", new_ub); + if (p == NULL) { + memcpy(new_ub, &default_beancounter, sizeof(*new_ub)); + init_beancounter_struct(new_ub); + } else { + memset(new_ub, 0, sizeof(*new_ub)); + init_beancounter_struct(new_ub); + init_beancounter_nolimits(new_ub); + init_beancounter_store(new_ub); + } + + if (percpu_counter_init(&new_ub->ub_orphan_count, 0)) + goto fail_pcpu; + + new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct); + if (new_ub->ub_percpu == NULL) + goto fail_free; + + new_ub->ub_uid = uid; + new_ub->parent = get_beancounter(p); + return new_ub; + +fail_free: + percpu_counter_destroy(&new_ub->ub_orphan_count); +fail_pcpu: + kmem_cache_free(ub_cachep, new_ub); + return NULL; +} + +static inline void __free_ub(struct user_beancounter *ub) +{ + free_percpu(ub->ub_percpu); + kmem_cache_free(ub_cachep, ub); +} + +static inline void free_ub(struct user_beancounter *ub) +{ + percpu_counter_destroy(&ub->ub_orphan_count); + __free_ub(ub); +} + +static inline struct user_beancounter *bc_lookup_hash(struct hlist_head *hash, + uid_t uid, struct user_beancounter *parent) +{ + struct user_beancounter *ub; + struct hlist_node *ptr; + + hlist_for_each_entry (ub, ptr, hash, ub_hash) + if (ub->ub_uid == uid && ub->parent == parent) + return get_beancounter(ub); + + 
return NULL; +} + +int ub_count; + +/* next two must be called under ub_hash_lock */ +static inline void ub_count_inc(struct user_beancounter *ub) +{ + if (ub->parent) + ub->parent->ub_childs++; + else + ub_count++; +} + +static inline void ub_count_dec(struct user_beancounter *ub) +{ + if (ub->parent) + ub->parent->ub_childs--; + else + ub_count--; +} + +struct user_beancounter *get_beancounter_byuid(uid_t uid, int create) +{ + struct user_beancounter *new_ub, *ub; + unsigned long flags; + struct hlist_head *hash; + + hash = &ub_hash[ub_hash_fun(uid)]; + new_ub = NULL; +retry: + spin_lock_irqsave(&ub_hash_lock, flags); + ub = bc_lookup_hash(hash, uid, NULL); + if (ub != NULL) { + spin_unlock_irqrestore(&ub_hash_lock, flags); + + if (new_ub != NULL) + free_ub(new_ub); + return ub; + } + + if (!create) { + /* no ub found */ + spin_unlock_irqrestore(&ub_hash_lock, flags); + return NULL; + } + + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + hlist_add_head(&new_ub->ub_hash, hash); + ub_count_inc(new_ub); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } + spin_unlock_irqrestore(&ub_hash_lock, flags); + + new_ub = alloc_ub(uid, NULL); + if (new_ub == NULL) + return NULL; + + goto retry; + +} +EXPORT_SYMBOL(get_beancounter_byuid); + +struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p, + int id, int create) +{ + struct user_beancounter *new_ub, *ub; + unsigned long flags; + struct hlist_head *hash; + + hash = &ub_hash[ub_subhash_fun(p, id)]; + new_ub = NULL; +retry: + spin_lock_irqsave(&ub_hash_lock, flags); + ub = bc_lookup_hash(hash, id, p); + if (ub != NULL) { + spin_unlock_irqrestore(&ub_hash_lock, flags); + + if (new_ub != NULL) { + put_beancounter(new_ub->parent); + free_ub(new_ub); + } + return ub; + } + + if (!create) { + /* no ub found */ + spin_unlock_irqrestore(&ub_hash_lock, flags); + return NULL; + } + + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + 
hlist_add_head(&new_ub->ub_hash, hash); + ub_count_inc(new_ub); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } + spin_unlock_irqrestore(&ub_hash_lock, flags); + + new_ub = alloc_ub(id, p); + if (new_ub == NULL) + return NULL; + + goto retry; +} +EXPORT_SYMBOL(get_subbeancounter_byid); + +static void put_warn(struct user_beancounter *ub) +{ + char id[64]; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_ERR "UB: Bad refcount (%d) on put of %s (%p)\n", + atomic_read(&ub->ub_refcount), id, ub); +} + +#ifdef CONFIG_BC_KEEP_UNUSED +#define release_beancounter(ub) do { } while (0) +#else +static int verify_res(struct user_beancounter *ub, int resource, + unsigned long held) +{ + char id[64]; + + if (likely(held == 0)) + return 1; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_WARNING "Ub %s helds %lu in %s on put\n", + id, held, ub_rnames[resource]); + return 0; +} + +static inline void bc_verify_held(struct user_beancounter *ub) +{ + int i, clean; + + clean = 1; + for (i = 0; i < UB_RESOURCES; i++) + clean &= verify_res(ub, i, ub->ub_parms[i].held); + + clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages); + clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages); + clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages); + + ub_debug_trace(!clean, 5, 60*HZ); +} + +static void bc_free_rcu(struct rcu_head *rcu) +{ + struct user_beancounter *ub; + + ub = container_of(rcu, struct user_beancounter, rcu); + __free_ub(ub); +} + +static void delayed_release_beancounter(struct work_struct *w) +{ + struct user_beancounter *ub, *parent; + unsigned long flags; + + ub = container_of(w, struct user_beancounter, cleanup.work); +again: + local_irq_save(flags); + if (!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock)) { + /* raced with get_beancounter_byuid */ + local_irq_restore(flags); + return; + } + + hlist_del(&ub->ub_hash); + ub_count_dec(ub); + list_del_rcu(&ub->ub_list); + 
spin_unlock_irqrestore(&ub_hash_lock, flags); + + bc_verify_held(ub); + ub_free_counters(ub); + percpu_counter_destroy(&ub->ub_orphan_count); + + parent = ub->parent; + + call_rcu(&ub->rcu, bc_free_rcu); + if (parent) { + ub = parent; + goto again; + } +} + +static inline void release_beancounter(struct user_beancounter *ub) +{ + struct execute_work *ew; + + ew = &ub->cleanup; + INIT_WORK(&ew->work, delayed_release_beancounter); + schedule_work(&ew->work); +} +#endif + +void __put_beancounter(struct user_beancounter *ub) +{ + unsigned long flags; + + /* equevalent to atomic_dec_and_lock_irqsave() */ + local_irq_save(flags); + if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) { + if (unlikely(atomic_read(&ub->ub_refcount) < 0)) + put_warn(ub); + local_irq_restore(flags); + return; + } + + if (unlikely(ub == get_ub0())) { + printk(KERN_ERR "Trying to put ub0\n"); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return; + } + + /* prevent get_beancounter_byuid + put_beancounter() reentrance */ + atomic_inc(&ub->ub_refcount); + spin_unlock_irqrestore(&ub_hash_lock, flags); + + release_beancounter(ub); +} +EXPORT_SYMBOL(__put_beancounter); + +void put_beancounter_safe(struct user_beancounter *ub) +{ + synchronize_rcu(); + __put_beancounter(ub); +} +EXPORT_SYMBOL(put_beancounter_safe); + +/* + * Generic resource charging stuff + */ + +int __charge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict) +{ + ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n", + val, resource, ub, ub->ub_parms[resource].held); + /* + * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition + * at the moment is possible so an overflow is impossible. 
+ */ + ub->ub_parms[resource].held += val; + + switch (strict) { + case UB_HARD: + if (ub->ub_parms[resource].held > + ub->ub_parms[resource].barrier) + break; + case UB_SOFT: + if (ub->ub_parms[resource].held > + ub->ub_parms[resource].limit) + break; + case UB_FORCE: + ub_adjust_maxheld(ub, resource); + return 0; + default: + BUG(); + } + + if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl)) + printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n", + ub_rnames[resource], ub->ub_uid); + ub->ub_parms[resource].failcnt++; + ub->ub_parms[resource].held -= val; + return -ENOMEM; +} + +int charge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict) +{ + int retval; + struct user_beancounter *p, *q; + unsigned long flags; + + retval = -EINVAL; + if (val > UB_MAXVALUE) + goto out; + + local_irq_save(flags); + for (p = ub; p != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + retval = __charge_beancounter_locked(p, resource, val, strict); + spin_unlock(&p->ub_lock); + if (retval) + goto unroll; + } +out_restore: + local_irq_restore(flags); +out: + return retval; + +unroll: + for (q = ub; q != p; q = q->parent) { + spin_lock(&q->ub_lock); + __uncharge_beancounter_locked(q, resource, val); + spin_unlock(&q->ub_lock); + } + goto out_restore; +} + +EXPORT_SYMBOL(charge_beancounter); + +void __charge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + struct user_beancounter *p; + unsigned long flags; + + local_irq_save(flags); + for (p = ub; p->parent != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + __charge_beancounter_locked(p, resource, val, UB_FORCE); + spin_unlock(&p->ub_lock); + } + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__charge_beancounter_notop); + +void uncharge_warn(struct user_beancounter *ub, int resource, + unsigned long val, unsigned long held) +{ + char id[64]; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub 
%s\n", + val, held, ub_rnames[resource], id); + ub_debug_trace(1, 10, 10*HZ); +} + +void __uncharge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val) +{ + ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n", + val, resource, ub, ub->ub_parms[resource].held); + if (ub->ub_parms[resource].held < val) { + uncharge_warn(ub, resource, + val, ub->ub_parms[resource].held); + val = ub->ub_parms[resource].held; + } + ub->ub_parms[resource].held -= val; +} + +void uncharge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val) +{ + unsigned long flags; + struct user_beancounter *p; + + for (p = ub; p != NULL; p = p->parent) { + spin_lock_irqsave(&p->ub_lock, flags); + __uncharge_beancounter_locked(p, resource, val); + spin_unlock_irqrestore(&p->ub_lock, flags); + } +} + +EXPORT_SYMBOL(uncharge_beancounter); + +void __uncharge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + struct user_beancounter *p; + unsigned long flags; + + local_irq_save(flags); + for (p = ub; p->parent != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + __uncharge_beancounter_locked(p, resource, val); + spin_unlock(&p->ub_lock); + } + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__uncharge_beancounter_notop); + + +/* + * Rate limiting stuff. 
+ */ +int ub_ratelimit(struct ub_rate_info *p) +{ + unsigned long cjif, djif; + unsigned long flags; + static DEFINE_SPINLOCK(ratelimit_lock); /* shared by all ub_rate_info users */ + long new_bucket; + + spin_lock_irqsave(&ratelimit_lock, flags); + cjif = jiffies; + djif = cjif - p->last; + if (djif < p->interval) { + if (p->bucket >= p->burst) { + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 0; + } + p->bucket++; + } else { + new_bucket = p->bucket - (djif / (unsigned)p->interval); + if (new_bucket < 0) + new_bucket = 0; + p->bucket = new_bucket + 1; + } + p->last = cjif; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 1; +} +EXPORT_SYMBOL(ub_ratelimit); + + +/* + * Initialization + * + * struct user_beancounter contains + * - limits and other configuration settings, + * with a copy stored for accounting purposes, + * - structural fields: lists, spinlocks and so on. + * + * Before these parts are initialized, the structure should be memset + * to 0 or copied from a known clean structure. That takes care of a lot + * of fields not initialized explicitly. + */ + +static void init_beancounter_struct(struct user_beancounter *ub) +{ + ub->ub_magic = UB_MAGIC; + ub->ub_cookie = get_random_int(); + atomic_set(&ub->ub_refcount, 1); + spin_lock_init(&ub->ub_lock); + INIT_LIST_HEAD(&ub->ub_tcp_sk_list); + INIT_LIST_HEAD(&ub->ub_other_sk_list); +#ifdef CONFIG_BC_DEBUG_KMEM + INIT_LIST_HEAD(&ub->ub_cclist); +#endif +} + +static void init_beancounter_store(struct user_beancounter *ub) +{ + int k; + + for (k = 0; k < UB_RESOURCES; k++) { + memcpy(&ub->ub_store[k], &ub->ub_parms[k], + sizeof(struct ubparm)); + } +} + +static void init_beancounter_nolimits(struct user_beancounter *ub) +{ + int k; + + for (k = 0; k < UB_RESOURCES; k++) { + ub->ub_parms[k].limit = UB_MAXVALUE; + /* FIXME: whether this is right for physpages and guarantees? */ + ub->ub_parms[k].barrier = UB_MAXVALUE; + } + + /* FIXME: set unlimited rate? 
*/ + ub->ub_limit_rl.burst = 4; + ub->ub_limit_rl.interval = 300*HZ; +} + +static void init_beancounter_syslimits(struct user_beancounter *ub) +{ + unsigned long mp; + extern int max_threads; + int k; + + mp = num_physpages; + ub->ub_parms[UB_KMEMSIZE].limit = + mp > (192*1024*1024 >> PAGE_SHIFT) ? + 32*1024*1024 : (mp << PAGE_SHIFT) / 6; + ub->ub_parms[UB_LOCKEDPAGES].limit = 8; + ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE; + ub->ub_parms[UB_SHMPAGES].limit = 64; + ub->ub_parms[UB_NUMPROC].limit = max_threads / 2; + ub->ub_parms[UB_NUMTCPSOCK].limit = 1024; + ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */ + ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */ + ub->ub_parms[UB_NUMOTHERSOCK].limit = 256; + ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */ + ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */ + ub->ub_parms[UB_NUMFLOCK].limit = 1024; + ub->ub_parms[UB_NUMPTY].limit = 16; + ub->ub_parms[UB_NUMSIGINFO].limit = 1024; + ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024; + ub->ub_parms[UB_NUMFILE].limit = 1024; + ub->ub_parms[UB_SWAPPAGES].limit = UB_MAXVALUE; + + for (k = 0; k < UB_RESOURCES; k++) + ub->ub_parms[k].barrier = ub->ub_parms[k].limit; + + ub->ub_limit_rl.burst = 4; + ub->ub_limit_rl.interval = 300*HZ; +} + +static DEFINE_PER_CPU(struct ub_percpu_struct, ub0_percpu); + +void __init ub_init_early(void) +{ + struct user_beancounter *ub; + + init_cache_counters(); + ub = get_ub0(); + memset(ub, 0, sizeof(*ub)); + ub->ub_uid = 0; + init_beancounter_nolimits(ub); + init_beancounter_store(ub); + init_beancounter_struct(ub); + ub->ub_percpu = &per_cpu__ub0_percpu; + + memset(&current->task_bc, 0, sizeof(struct task_beancounter)); + (void)set_exec_ub(ub); + current->task_bc.task_ub = get_beancounter(ub); + __charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE); + current->task_bc.fork_sub = get_beancounter(ub); + ub_init_task_bc(&current->task_bc); + init_mm.mm_ub = 
get_beancounter(ub); + + hlist_add_head(&ub->ub_hash, &ub_hash[ub->ub_uid]); + list_add(&ub->ub_list, &ub_list_head); + ub_count_inc(ub); +} + +void __init ub_init_late(void) +{ + ub_cachep = kmem_cache_create("user_beancounters", + sizeof(struct user_beancounter), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + memset(&default_beancounter, 0, sizeof(default_beancounter)); +#ifdef CONFIG_BC_UNLIMITED + init_beancounter_nolimits(&default_beancounter); +#else + init_beancounter_syslimits(&default_beancounter); +#endif + init_beancounter_store(&default_beancounter); + init_beancounter_struct(&default_beancounter); +} diff -urNp linux-2.6.32.48/kernel/bc/dcache.c linux-2.6.32.48-openvz/kernel/bc/dcache.c --- linux-2.6.32.48/kernel/bc/dcache.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/dcache.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,399 @@ +/* + * kernel/bc/dcache.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Locking + * traverse dcache_lock d_lock + * ub_dentry_charge + - + + * ub_dentry_uncharge + + - + * ub_dentry_charge_nofail + + - + * + * d_inuse changes are atomic, with special handling of "not in use" <-> + * "in use" (-1 <-> 0) transitions. We have two sources of non-atomicity + * here: (1) in many operations we need to change d_inuse of both dentry and + * its parent, and (2) on state transitions we need to adjust the account. + * + * Regarding (1): we do not have (and do not want) a single lock covering all + * operations, so in general it's impossible to get a consistent view of + * a tree with respect to d_inuse counters (except by swsuspend). 
It also + * means if a dentry with d_inuse of 0 gets one new in-use child and loses + * one, its d_inuse counter will go either 0 -> 1 -> 0 path or 0 -> -1 -> 0, + * and we can't say which way. + * Note that path -1 -> 0 -> -1 can't turn into -1 -> -2 -> -1, since + * uncharge can be done only after return from charge (with d_genocide being + * the only apparent exception). + * Regarding (2): there is a similar uncertainty with the dcache account. + * If the account is equal to the limit, one more dentry is started to be + * used and one is put, the account will either hit the limit (and an error + * will be returned), or decrement will happen before increment. + * + * These races do not really matter. + * The only things we want are: + * - if a system is suspended with no in-use dentries, all d_inuse counters + * should be correct (-1); + * - d_inuse counters should always be >= -1. + * This holds if ->parent references are accessed and maintained properly. + * In subtle moments (like d_move) dentries exchanging their parents should + * both be in-use. At d_genocide time, lookups and charges are assumed to be + * impossible. 
+ */ + +/* + * Hierarchical accounting + * UB argument must NOT be NULL + */ + +static int do_charge_dcache(struct user_beancounter *ub, unsigned long size, + enum ub_severity sv) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv)) + goto out_mem; + if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv)) + goto out_dcache; + spin_unlock_irqrestore(&ub->ub_lock, flags); + return 0; + +out_dcache: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); +out_mem: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return -ENOMEM; +} + +static void do_uncharge_dcache(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); + __uncharge_beancounter_locked(ub, UB_DCACHESIZE, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static int charge_dcache(struct user_beancounter *ub, unsigned long size, + enum ub_severity sv) +{ + struct user_beancounter *p, *q; + + for (p = ub; p != NULL; p = p->parent) { + if (do_charge_dcache(p, size, sv)) + goto unroll; + } + return 0; + +unroll: + for (q = ub; q != p; q = q->parent) + do_uncharge_dcache(q, size); + return -ENOMEM; +} + +void uncharge_dcache(struct user_beancounter *ub, unsigned long size) +{ + for (; ub != NULL; ub = ub->parent) + do_uncharge_dcache(ub, size); +} + +/* + * Simple helpers to do maintain account and d_ub field. 
+ */ + +static inline int d_charge(struct dentry_beancounter *d_bc) +{ + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + if (charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) { + put_beancounter(ub); + return -1; + } + d_bc->d_ub = ub; + return 0; +} + +static inline void d_forced_charge(struct dentry_beancounter *d_bc) +{ + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + charge_dcache(ub, d_bc->d_ubsize, UB_FORCE); + d_bc->d_ub = ub; +} + +/* + * Minor helpers + */ + +extern struct kmem_cache *dentry_cache; +extern struct kmem_cache *inode_cachep; +static struct rw_semaphore ub_dentry_alloc_sem; + +static inline unsigned long d_charge_size(struct dentry *dentry) +{ + /* dentry's d_name is already set to appropriate value (see d_alloc) */ + return kmem_cache_objuse(inode_cachep) + kmem_cache_objuse(dentry_cache) + + (dname_external(dentry) ? + kmem_dname_objuse((void *)dentry->d_name.name) : 0); +} + +/* + * Entry points from dcache.c + */ + +/* + * Set initial d_inuse on d_alloc. + * Called with no locks, preemption disabled. + */ +int __ub_dentry_alloc(struct dentry *dentry) +{ + struct dentry_beancounter *d_bc; + + d_bc = &dentry->dentry_bc; + d_bc->d_ub = get_beancounter(get_exec_ub()); + atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in dcache.h */ + d_bc->d_ubsize = d_charge_size(dentry); + + if (charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD)) + goto failure; + return 0; + +failure: + put_beancounter(d_bc->d_ub); + d_bc->d_ub = NULL; + return -ENOMEM; +} +void __ub_dentry_alloc_start(void) +{ + down_read(&ub_dentry_alloc_sem); + current->task_bc.dentry_alloc = 1; +} + +void __ub_dentry_alloc_end(void) +{ + current->task_bc.dentry_alloc = 0; + up_read(&ub_dentry_alloc_sem); +} + +/* + * It is assumed that parent is already in use, so traverse upwards is + * limited to one ancestor only. + * Called under d_lock and rcu_read_lock. 
+ */ +int __ub_dentry_charge(struct dentry *dentry) +{ + struct dentry_beancounter *d_bc; + struct dentry *parent; + int ret; + + if (ub_dget_testone(dentry)) { + d_bc = &dentry->dentry_bc; + /* state transition -1 => 0 */ + if (d_charge(d_bc)) + goto failure; + + if (dentry != dentry->d_parent) { + parent = dentry->d_parent; + if (ub_dget_testone(parent)) + BUG(); + } + } + return 0; + +failure: + /* + * Here we would like to fail the lookup. + * It is not easy: if d_lookup fails, callers expect that a dentry + * with the given name doesn't exist, and create a new one. + * So, first we forcedly charge for this dentry. + * Then try to remove it from cache safely. If it turns out to be + * possible, we can return error. + */ + d_forced_charge(d_bc); + + if (dentry != dentry->d_parent) { + parent = dentry->d_parent; + if (ub_dget_testone(parent)) + BUG(); + } + + ret = 0; + if (spin_trylock(&dcache_lock)) { + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + rcu_read_unlock(); + shrink_dcache_parent(dentry); + rcu_read_lock(); + spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + } + if (atomic_read(&dentry->d_count) == 1) { + __d_drop(dentry); + ret = -1; + } + spin_unlock(&dcache_lock); + } + + return ret; +} + +/* + * Go up in the tree decreasing d_inuse. + * Called under dcache_lock. + */ +void __ub_dentry_uncharge(struct dentry *dentry) +{ + struct dentry *parent; + struct user_beancounter *ub; + unsigned long size; + + /* go up until state doesn't change or and root is reached */ + size = dentry->dentry_bc.d_ubsize; + ub = dentry->dentry_bc.d_ub; + while (ub_dput_testzero(dentry)) { + /* state transition 0 => -1 */ + uncharge_dcache(ub, size); + put_beancounter(ub); + + parent = dentry->d_parent; + if (dentry == parent) + break; + + dentry = parent; + size = dentry->dentry_bc.d_ubsize; + ub = dentry->dentry_bc.d_ub; + } +} + +/* + * Forced charge for __dget_locked, where API doesn't allow to return error. 
+ * Called under dcache_lock. + */ +void __ub_dentry_charge_nofail(struct dentry *dentry) +{ + struct dentry *parent; + + while (ub_dget_testone(dentry)) { + /* state transition -1 => 0 */ + d_forced_charge(&dentry->dentry_bc); + + parent = dentry->d_parent; + if (dentry == parent) + break; + dentry = parent; + } +} + +/* + * Adaptive accounting + */ + +int ub_dentry_on = 1; +int ub_dentry_alloc_barrier; +EXPORT_SYMBOL(ub_dentry_on); + +static unsigned long checklowat = 0; +static unsigned long checkhiwat = ULONG_MAX; + +static int sysctl_ub_dentry_chk = 10; +#define sysctl_ub_lowat sysctl_ub_watermark[0] +#define sysctl_ub_hiwat sysctl_ub_watermark[1] +static DECLARE_RWSEM(ub_dentry_alloc_sem); +/* 1024th of lowmem size */ +static unsigned int sysctl_ub_watermark[2] = {0, 100}; + +static void ub_dentry_set_limits(unsigned long pages, unsigned long cap) +{ + down_write(&ub_dentry_alloc_sem); + preempt_disable(); + checklowat = (pages >> 10) * sysctl_ub_lowat; + checkhiwat = (pages >> 10) * sysctl_ub_hiwat; + if (checkhiwat > cap) { + checkhiwat = cap; + checklowat = cap / sysctl_ub_hiwat * sysctl_ub_lowat; + } + preempt_enable(); + up_write(&ub_dentry_alloc_sem); +} + +static int ub_dentry_proc_handler(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + + r = proc_dointvec(ctl, write, buffer, lenp, ppos); + if (!r && write) + ub_dentry_set_limits(totalram_pages - totalhigh_pages, + ULONG_MAX); + return r; +} + +static ctl_table ub_dentry_sysctl_table[] = { + { + .procname = "dentry_check", + .data = &sysctl_ub_dentry_chk, + .maxlen = sizeof(sysctl_ub_dentry_chk), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "dentry_watermark", + .data = &sysctl_ub_lowat, + .maxlen = sizeof(sysctl_ub_lowat) * 2, + .mode = 0644, + .proc_handler = ub_dentry_proc_handler, + }, + { .ctl_name = 0 } +}; +static ctl_table ub_dentry_sysctl_root[] = { + { + .procname = "ubc", + .mode = 0555, + .child = ub_dentry_sysctl_table, 
+ }, + { .ctl_name = 0 } +}; + +static int __init ub_dentry_init(void) +{ + /* + * Initial watermarks are limited, to limit walk time. + * 384MB translates into 0.8 sec on PIII 866MHz. + */ + ub_dentry_set_limits(totalram_pages - totalhigh_pages, + 384 * 1024 * 1024 / PAGE_SIZE); + if (register_sysctl_table(ub_dentry_sysctl_root) == NULL) + return -ENOMEM; + return 0; +} +__initcall(ub_dentry_init); diff -urNp linux-2.6.32.48/kernel/bc/io_acct.c linux-2.6.32.48-openvz/kernel/bc/io_acct.c --- linux-2.6.32.48/kernel/bc/io_acct.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/io_acct.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,501 @@ +/* + * kernel/bc/io_acct.c + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Pavel Emelianov + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static struct mempool_s *pb_pool; + +#define PB_MIN_IO (1024) + +static inline struct page_beancounter *io_pb_alloc(void) +{ + return mempool_alloc(pb_pool, GFP_ATOMIC); +} + +static inline void io_pb_free(struct page_beancounter *pb) +{ + mempool_free(pb, pb_pool); +} + +struct page_beancounter **page_pblist(struct page *page) +{ + struct page_beancounter **pb, *iopb; + + pb = &page_pbc(page); + iopb = iopb_to_pb(*pb); + + return iopb == NULL ? pb : &iopb->page_pb_list; +} + +/* + * We save the context page was set dirty to use it later + * when the real write starts. If the page is mapped then + * IO pb is stores like this: + * + * Before saving: + * + * +- page -------+ + * | ... | + * | page_pb +---+ + * +--------------+ | +-----+ +-----+ +-----+ + * +-> | pb1 | -> | pb2 | - ... -> | pbN | -+ + * +-----+ +-----+ +-----+ | + * ^ | + * +---------------------------------+ + * + * After saving: + * + * +- page -------+ +- io pb ------+ + * | ... | | ... 
| + * | page_pb +----> | page_pb_list +-+ + * +--------------+ +--------------+ | + * | + * +-------------------+ + * | + * | +-----+ +-----+ +-----+ + * +-> | pb1 | -> | pb2 | - ... -> | pbN | -+ + * +-----+ +-----+ +-----+ | + * ^ | + * +---------------------------------+ + * + * And the page_pblist(...) function returns pointer to the place that + * points to this pbX ring. + */ + +#ifdef CONFIG_BC_DEBUG_IO +static LIST_HEAD(pb_io_list); +static unsigned long anon_pages, not_released; + +static inline void io_debug_save(struct page_beancounter *pb, + struct page_beancounter *mpb) +{ + pb->io_debug = (mpb == NULL); + list_add(&pb->io_list, &pb_io_list); +} + +static inline void io_debug_release(struct page_beancounter *pb) +{ + list_del(&pb->io_list); +} + +void ub_io_release_debug(struct page *page) +{ + struct page_beancounter *pb; + static int once = 0; + + pb = page_pbc(page); + if (likely(iopb_to_pb(pb) == NULL)) + return; + + if (!once) { + printk("BUG: Page has an IO bc but is not expectd to\n"); + dump_stack(); + once = 1; + } + + spin_lock(&pb_lock); + not_released++; + pb = iopb_to_pb(pb); + page_pbc(page) = NULL; + io_debug_release(pb); + pb->ub->io_pb_held--; + spin_unlock(&pb_lock); + + put_beancounter(pb->ub); + io_pb_free(pb); +} + +static inline int io_debug_precheck_save(struct page *page) +{ + if (unlikely(PageAnon(page))) { + anon_pages++; + return 1; + } + + return 0; +} + +static inline int io_debug_precheck_release(struct page *page) +{ + return 0; +} +#else +#define io_debug_save(pb, mpb) do { } while (0) +#define io_debug_release(pb) do { } while (0) +#define io_debug_precheck_save(page) (0) +#define io_debug_precheck_release(p) (0) +#endif + +static inline void set_page_io(struct page *page, struct page_beancounter *pb, + struct page_beancounter *mapped_pb) +{ + unsigned long val; + + val = (unsigned long)pb | PAGE_IO_MARK; + pb->page = page; + + page_pbc(page) = (struct page_beancounter *)val; + io_debug_save(pb, mapped_pb); + 
pb->ub->io_pb_held++; +} + +static inline void put_page_io(struct page *page, struct page_beancounter *pb) +{ + pb->ub->io_pb_held--; + io_debug_release(pb); + page_pbc(page) = pb->page_pb_list; +} + +void ub_io_save_context(struct page *page, size_t bytes_dirtied) +{ + struct user_beancounter *ub; + struct page_beancounter *pb, *mapped_pb, *io_pb; + + if (unlikely(in_interrupt())) { + WARN_ON_ONCE(1); + return; + } + + /* + * FIXME - this can happen from atomic context and + * it's probably not that good to loose some requests + */ + + pb = io_pb_alloc(); + io_pb = NULL; + + spin_lock(&pb_lock); + if (io_debug_precheck_save(page)) + goto out_unlock; + + mapped_pb = page_pbc(page); + io_pb = iopb_to_pb(mapped_pb); + if (io_pb != NULL) { + /* + * this page has an IO - release it and force a new one + * We could also race with page cleaning - see below + */ + mapped_pb = io_pb->page_pb_list; + put_page_io(page, io_pb); + } + + /* + * If the page is mapped we must save the context + * it maps to. If the page isn't mapped we use current + * context as this is a regular write. 
+ */ + + if (mapped_pb != NULL) + ub = top_beancounter(mapped_pb->ub); + else + ub = get_io_ub(); + + if (!PageDirty(page)) { + /* + * race with clear_page_dirty(_for_io) - account + * writes for ub_io_release_context() + */ + if (io_pb != NULL) + io_pb->ub->bytes_wrote += PAGE_CACHE_SIZE; + if (pb != NULL) + io_pb_free(pb); + goto out_unlock; + } + + if (pb == NULL) { + ub->bytes_dirty_missed += bytes_dirtied; + goto out_unlock; + } + + /* + * the page may become clean here, but the context will be seen + * in ub_io_release_context() + */ + + pb->ub = get_beancounter(ub); + pb->page_pb_list = mapped_pb; + ub->bytes_dirtied += bytes_dirtied; + + set_page_io(page, pb, mapped_pb); + +out_unlock: + spin_unlock(&pb_lock); + + if (io_pb != NULL) { + put_beancounter(io_pb->ub); + io_pb_free(io_pb); + } +} + +void ub_io_release_context(struct page *page, size_t wrote) +{ + struct page_beancounter *pb; + + if (io_debug_precheck_release(page)) + return; + + if (unlikely(in_interrupt())) { + WARN_ON_ONCE(1); + return; + } + + spin_lock(&pb_lock); + pb = iopb_to_pb(page_pbc(page)); + if (unlikely(pb == NULL)) + /* + * this may happen if we failed to allocate + * context in ub_io_save_context or raced with it + */ + goto out_unlock; + + if (wrote) + pb->ub->bytes_wrote += wrote; + + put_page_io(page, pb); +out_unlock: + spin_unlock(&pb_lock); + + if (pb != NULL) { + put_beancounter(pb->ub); + io_pb_free(pb); + } +} + +void __init ub_init_io(struct kmem_cache *pb_cachep) +{ + pb_pool = mempool_create_slab_pool(PB_MIN_IO, pb_cachep); + if (pb_pool == NULL) + panic("Can't create pb_pool"); +} + +#ifdef CONFIG_PROC_FS +#define in_flight(var) (var > var##_done ? 
var - var##_done : 0) + +static int bc_ioacct_show(struct seq_file *f, void *v) +{ + int i; + unsigned long long read, write, cancel; + unsigned long sync, sync_done; + unsigned long fsync, fsync_done; + unsigned long fdsync, fdsync_done; + unsigned long frsync, frsync_done; + unsigned long reads, writes; + unsigned long long rchar, wchar; + struct user_beancounter *ub; + + ub = seq_beancounter(f); + + read = write = cancel = 0; + sync = sync_done = fsync = fsync_done = + fdsync = fdsync_done = frsync = frsync_done = 0; + reads = writes = 0; + rchar = wchar = 0; + for_each_online_cpu(i) { + struct ub_percpu_struct *ub_percpu; + ub_percpu = per_cpu_ptr(ub->ub_percpu, i); + + read += ub_percpu->bytes_read; + write += ub_percpu->bytes_wrote; + cancel += ub_percpu->bytes_cancelled; + + sync += ub_percpu->sync; + fsync += ub_percpu->fsync; + fdsync += ub_percpu->fdsync; + frsync += ub_percpu->frsync; + sync_done += ub_percpu->sync_done; + fsync_done += ub_percpu->fsync_done; + fdsync_done += ub_percpu->fdsync_done; + frsync_done += ub_percpu->frsync_done; + + reads += ub_percpu->read; + writes += ub_percpu->write; + rchar += ub_percpu->rchar; + wchar += ub_percpu->wchar; + } + + seq_printf(f, bc_proc_llu_fmt, "read", read); + seq_printf(f, bc_proc_llu_fmt, "write", ub->bytes_wrote + write); + seq_printf(f, bc_proc_llu_fmt, "dirty", ub->bytes_dirtied); + seq_printf(f, bc_proc_llu_fmt, "cancel", cancel); + seq_printf(f, bc_proc_llu_fmt, "missed", ub->bytes_dirty_missed); + + seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync); + seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync); + seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync); + seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync); + + seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync)); + seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync)); + seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fdsync)); + seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", 
in_flight(frsync)); + + seq_printf(f, bc_proc_lu_lfmt, "vfs_reads", reads); + seq_printf(f, bc_proc_llu_fmt, "vfs_read_chars", rchar); + seq_printf(f, bc_proc_lu_lfmt, "vfs_writes", writes); + seq_printf(f, bc_proc_llu_fmt, "vfs_write_chars", wchar); + + seq_printf(f, bc_proc_lu_lfmt, "io_pbs", ub->io_pb_held); + return 0; +} + +static struct bc_proc_entry bc_ioacct_entry = { + .name = "ioacct", + .u.show = bc_ioacct_show, +}; + +#ifdef CONFIG_BC_DEBUG_IO +#define PTR_SIZE (int)(sizeof(void *) * 2) +#define INT_SIZE (int)(sizeof(int) * 2) + +static int bc_io_show(struct seq_file *f, void *v) +{ + struct list_head *lh; + struct page_beancounter *pb; + struct page *pg; + + lh = (struct list_head *)v; + if (lh == &pb_io_list) { + seq_printf(f, "Races: anon %lu missed %lu\n", + anon_pages, not_released); + + seq_printf(f, "%-*s %-1s %-*s %-4s %*s %*s " + "%-*s %-*s %-1s %-*s %-*s\n", + PTR_SIZE, "pb", "", + PTR_SIZE, "page", "flg", + INT_SIZE, "cnt", INT_SIZE, "mcnt", + PTR_SIZE, "pb_list", + PTR_SIZE, "page_pb", "", + PTR_SIZE, "mapping", + INT_SIZE, "ub"); + return 0; + } + + pb = list_entry(lh, struct page_beancounter, io_list); + pg = pb->page; + seq_printf(f, "%p %c %p %c%c%c%c %*d %*d %p %p %c %p %d\n", + pb, pb->io_debug ? 'e' : 'm', pg, + PageDirty(pg) ? 'D' : 'd', + PageAnon(pg) ? 'A' : 'a', + PageWriteback(pg) ? 'W' : 'w', + PageLocked(pg) ? 'L' : 'l', + INT_SIZE, page_count(pg), + INT_SIZE, page_mapcount(pg), + pb->page_pb_list, page_pbc(pg), + iopb_to_pb(page_pbc(pg)) == pb ? 
' ' : '!', + pg->mapping, pb->ub->ub_uid); + return 0; +} + +static void *bc_io_start(struct seq_file *f, loff_t *ppos) +{ + spin_lock(&pb_lock); + return seq_list_start_head(&pb_io_list, *ppos); +} + +static void *bc_io_next(struct seq_file *f, void *v, loff_t *ppos) +{ + return seq_list_next(v, &pb_io_list, ppos); +} + +static void bc_io_stop(struct seq_file *f, void *v) +{ + spin_unlock(&pb_lock); +} + +static struct seq_operations bc_io_seq_ops = { + .start = bc_io_start, + .next = bc_io_next, + .stop = bc_io_stop, + .show = bc_io_show, +}; + +static int bc_io_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &bc_io_seq_ops); +} +static struct file_operations bc_io_debug_ops = { + .open = bc_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct bc_proc_entry bc_ioacct_debug_entry = { + .name = "ioacct_debug", + .u.fops = &bc_io_debug_ops, +}; +#endif + +static int bc_ioacct_notify(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + struct user_beancounter *ub; + unsigned long *vm_events; + unsigned long long bin, bout; + int i; + + if (event != VIRTINFO_VMSTAT) + return old_ret; + + ub = top_beancounter(get_exec_ub()); + if (ub == get_ub0()) + return old_ret; + + /* Think over: do we need to account here bytes_dirty_missed? 
*/ + bout = ub->bytes_wrote; + bin = 0; + for_each_online_cpu(i) { + bout += per_cpu_ptr(ub->ub_percpu, i)->bytes_wrote; + bin += per_cpu_ptr(ub->ub_percpu, i)->bytes_read; + } + + /* convert to Kbytes */ + bout >>= 10; + bin >>= 10; + + vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS; + vm_events[PGPGOUT] = (unsigned long)bout; + vm_events[PGPGIN] = (unsigned long)bin; + return NOTIFY_OK; +} + +static struct vnotifier_block bc_ioacct_nb = { + .notifier_call = bc_ioacct_notify, +}; + +static int __init bc_ioacct_init(void) +{ +#ifdef CONFIG_BC_DEBUG_IO + bc_register_proc_root_entry(&bc_ioacct_debug_entry); +#endif + bc_register_proc_entry(&bc_ioacct_entry); + + virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb); + return 0; +} + +late_initcall(bc_ioacct_init); +#endif diff -urNp linux-2.6.32.48/kernel/bc/Kconfig linux-2.6.32.48-openvz/kernel/bc/Kconfig --- linux-2.6.32.48/kernel/bc/Kconfig 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/Kconfig 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,103 @@ +# +# User resources part (UBC) +# +# Copyright (C) 2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +menu "User resources" + +config BEANCOUNTERS + bool "Enable user resource accounting" + default y + help + This patch provides accounting and allows to configure + limits for user's consumption of exhaustible system resources. + The most important resource controlled by this patch is unswappable + memory (either mlock'ed or used by internal kernel structures and + buffers). The main goal of this patch is to protect processes + from running short of important resources because of an accidental + misbehavior of processes or malicious activity aiming to ``kill'' + the system. It's worth to mention that resource limits configured + by setrlimit(2) do not give an acceptable level of protection + because they cover only small fraction of resources and work on a + per-process basis. 
Per-process accounting doesn't prevent malicious + users from spawning a lot of resource-consuming processes. + +config BC_RSS_ACCOUNTING + bool "Account physical memory usage" + default y + depends on BEANCOUNTERS + help + This allows estimating per-beancounter physical memory usage. + The implemented algorithm accounts shared pages of memory as well, + dividing them by the number of beancounters which use the page. + +config BC_IO_ACCOUNTING + bool "Account disk IO" + default y + depends on BC_RSS_ACCOUNTING + help + When on, this option allows seeing disk IO activity caused by + tasks from each UB. + +config BC_SWAP_ACCOUNTING + bool "Account swap usage" + default y + depends on BEANCOUNTERS + help + This allows accounting of swap usage. + +config BC_PROC + bool "Report resource usage in /proc" + default y + depends on BEANCOUNTERS + help + Allows a system administrator to inspect resource accounts and limits. + +config BC_DEBUG + bool "User resources debug features" + default n + depends on BEANCOUNTERS + help + Enables setting up debug features for user resource accounting. + +config BC_DEBUG_IO + bool "Debug IO accounting" + default y + depends on BC_DEBUG && BC_IO_ACCOUNTING + help + Debugging for IO accounting. + +config BC_DEBUG_KMEM + bool "Debug kmemsize with cache counters" + default n + depends on BC_DEBUG + help + Adds /proc/user_beancounters_debug entry to get statistics + about cache usage of each beancounter. + +config BC_KEEP_UNUSED + bool "Keep unused beancounter alive" + default y + depends on BC_DEBUG + help + If on, unused beancounters are kept on the hash and maxheld value + can be looked through. + +config BC_DEBUG_ITEMS + bool "Account resources in items rather than in bytes" + default y + depends on BC_DEBUG + help + When true some of the resources (e.g. kmemsize) are accounted + in items instead of bytes. 
+ +config BC_UNLIMITED + bool "Use unlimited ubc settings" + default y + depends on BC_DEBUG + help + When ON all limits and barriers are set to max values. +endmenu diff -urNp linux-2.6.32.48/kernel/bc/kmem.c linux-2.6.32.48-openvz/kernel/bc/kmem.c --- linux-2.6.32.48/kernel/bc/kmem.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/kmem.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,405 @@ +/* + * kernel/bc/kmem.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Initialization + */ + +/* + * Slab accounting + */ + +#ifdef CONFIG_BC_DEBUG_KMEM + +#define CC_HASH_SIZE 1024 +static struct ub_cache_counter *cc_hash[CC_HASH_SIZE]; +spinlock_t cc_lock; + +static void __free_cache_counters(struct user_beancounter *ub, + struct kmem_cache *cachep) +{ + struct ub_cache_counter *cc, **pprev, *del; + int i; + unsigned long flags; + + del = NULL; + spin_lock_irqsave(&cc_lock, flags); + for (i = 0; i < CC_HASH_SIZE; i++) { + pprev = &cc_hash[i]; + cc = cc_hash[i]; + while (cc != NULL) { + if (cc->ub != ub && cc->cachep != cachep) { + pprev = &cc->next; + cc = cc->next; + continue; + } + + list_del(&cc->ulist); + *pprev = cc->next; + cc->next = del; + del = cc; + cc = *pprev; + } + } + spin_unlock_irqrestore(&cc_lock, flags); + + while (del != NULL) { + cc = del->next; + kfree(del); + del = cc; + } +} + +void ub_free_counters(struct user_beancounter *ub) +{ + __free_cache_counters(ub, NULL); +} + +void ub_kmemcache_free(struct kmem_cache *cachep) +{ + __free_cache_counters(NULL, cachep); +} + +void __init init_cache_counters(void) +{ + memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0])); + spin_lock_init(&cc_lock); +} + +#define cc_hash_fun(ub, cachep) ( \ + (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \ + 
((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \ + ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \ + ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \ + ) & (CC_HASH_SIZE - 1)) + +static int change_slab_charged(struct user_beancounter *ub, + struct kmem_cache *cachep, long val) +{ + struct ub_cache_counter *cc, *new_cnt, **pprev; + unsigned long flags; + + new_cnt = NULL; +again: + spin_lock_irqsave(&cc_lock, flags); + cc = cc_hash[cc_hash_fun(ub, cachep)]; + while (cc) { + if (cc->ub == ub && cc->cachep == cachep) + goto found; + cc = cc->next; + } + + if (new_cnt != NULL) + goto insert; + + spin_unlock_irqrestore(&cc_lock, flags); + + new_cnt = kmalloc(sizeof(*new_cnt), GFP_ATOMIC); + if (new_cnt == NULL) + return -ENOMEM; + + new_cnt->counter = 0; + new_cnt->ub = ub; + new_cnt->cachep = cachep; + goto again; + +insert: + pprev = &cc_hash[cc_hash_fun(ub, cachep)]; + new_cnt->next = *pprev; + *pprev = new_cnt; + list_add(&new_cnt->ulist, &ub->ub_cclist); + cc = new_cnt; + new_cnt = NULL; + +found: + cc->counter += val; + spin_unlock_irqrestore(&cc_lock, flags); + if (new_cnt) + kfree(new_cnt); + return 0; +} + +static inline int inc_slab_charged(struct user_beancounter *ub, + struct kmem_cache *cachep) +{ + return change_slab_charged(ub, cachep, 1); +} + +static inline void dec_slab_charged(struct user_beancounter *ub, + struct kmem_cache *cachep) +{ + if (change_slab_charged(ub, cachep, -1) < 0) + BUG(); +} + +#include + +#define inc_pages_charged(ub, order) ub_percpu_add(ub, \ + pages_charged, 1 << order) +#define dec_pages_charged(ub, order) ub_percpu_sub(ub, \ + pages_charged, 1 << order) + +#ifdef CONFIG_PROC_FS +static int bc_kmem_debug_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + struct ub_cache_counter *cc; + long pages, vmpages; + int i; + + ub = seq_beancounter(f); + + pages = vmpages = 0; + for_each_online_cpu(i) { + pages += per_cpu_ptr(ub->ub_percpu, i)->pages_charged; + vmpages += per_cpu_ptr(ub->ub_percpu, 
i)->vmalloc_charged;
+	}
+	if (pages < 0)
+		pages = 0;
+	if (vmpages < 0)
+		vmpages = 0;
+
+	seq_printf(f, bc_proc_lu_lu_fmt, "pages", pages, PAGE_SIZE);
+	seq_printf(f, bc_proc_lu_lu_fmt, "vmalloced", vmpages, PAGE_SIZE);
+	seq_printf(f, bc_proc_lu_lu_fmt, "pbcs", ub->ub_pbcs,
+			sizeof(struct page_beancounter));
+
+	spin_lock_irq(&cc_lock);
+	list_for_each_entry (cc, &ub->ub_cclist, ulist) {
+		struct kmem_cache *cachep;
+
+		cachep = cc->cachep;
+		seq_printf(f, bc_proc_lu_lu_fmt,
+				kmem_cache_name(cachep),
+				cc->counter,
+				kmem_cache_objuse(cachep));
+	}
+	spin_unlock_irq(&cc_lock);
+	return 0;
+}
+
+static struct bc_proc_entry bc_kmem_debug_entry = {
+	.name = "kmem_debug",
+	.u.show = bc_kmem_debug_show,
+};
+
+static int __init bc_kmem_debug_init(void)
+{
+	bc_register_proc_entry(&bc_kmem_debug_entry);
+	return 0;
+}
+
+late_initcall(bc_kmem_debug_init);
+#endif
+
+#else
+#define inc_slab_charged(ub, cache)	(0)
+#define dec_slab_charged(ub, cache)	do { } while (0)
+#define inc_pages_charged(ub, cache)	do { } while (0)
+#define dec_pages_charged(ub, cache)	do { } while (0)
+#endif
+
+#define UB_KMEM_QUANT (PAGE_SIZE * 4)
+
+/*
+ * Charge @size bytes of UB_KMEMSIZE to @ub.
+ *
+ * When @ub is the current task's task_ub and @size does not exceed
+ * UB_KMEM_QUANT, the request is first served from the task's
+ * kmem_precharged reserve.  If the reserve is too small, a whole
+ * UB_KMEM_QUANT is charged with UB_HARD and the unused remainder is kept
+ * in the reserve.  In every other case (foreign beancounter, large
+ * request, or the quantum charge failed) @size is charged directly with
+ * the caller's @strict severity.
+ *
+ * Returns 0 on success, otherwise the result of charge_beancounter().
+ *
+ * called with IRQ disabled
+ */
+int ub_kmemsize_charge(struct user_beancounter *ub,
+		unsigned long size,
+		enum ub_severity strict)
+{
+	struct task_beancounter *tbc;
+
+	tbc = &current->task_bc;
+	if (ub != tbc->task_ub || size > UB_KMEM_QUANT)
+		goto just_charge;
+	if (tbc->kmem_precharged >= size) {
+		tbc->kmem_precharged -= size;
+		return 0;
+	}
+
+	if (charge_beancounter(ub, UB_KMEMSIZE, UB_KMEM_QUANT, UB_HARD) == 0) {
+		tbc->kmem_precharged += UB_KMEM_QUANT - size;
+		return 0;
+	}
+
+just_charge:
+	return charge_beancounter(ub, UB_KMEMSIZE, size, strict);
+}
+
+/*
+ * Give @size bytes of UB_KMEMSIZE back to @ub.
+ *
+ * When @ub is the current task's task_ub the bytes go into the task's
+ * kmem_precharged reserve; the beancounter itself is only uncharged once
+ * the reserve reaches 2 * UB_KMEM_QUANT, and then by everything above one
+ * UB_KMEM_QUANT.  For any other beancounter @size is uncharged directly.
+ * A @size above UB_MAXVALUE is reported with a stack dump but is still
+ * processed.
+ *
+ * called with IRQ disabled
+ */
+void ub_kmemsize_uncharge(struct user_beancounter *ub,
+		unsigned long size)
+{
+	struct task_beancounter *tbc;
+
+	if (size > UB_MAXVALUE) {
+		printk("ub_kmemsize_uncharge: size %lu\n", size);
+		dump_stack();
+	}
+
+	tbc = &current->task_bc;
+	if (ub !=
tbc->task_ub) + goto just_uncharge; + + tbc->kmem_precharged += size; + if (tbc->kmem_precharged < UB_KMEM_QUANT * 2) + return; + size = tbc->kmem_precharged - UB_KMEM_QUANT; + tbc->kmem_precharged -= size; + +just_uncharge: + uncharge_beancounter(ub, UB_KMEMSIZE, size); +} + +/* called with IRQ disabled */ +int ub_slab_charge(struct kmem_cache *cachep, void *objp, gfp_t flags) +{ + unsigned int size; + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + if (ub == NULL) + return 0; + + size = CHARGE_SIZE(kmem_cache_objuse(cachep)); + if (ub_kmemsize_charge(ub, size, + (flags & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD))) + goto out_err; + + if (inc_slab_charged(ub, cachep) < 0) { + ub_kmemsize_uncharge(ub, size); + goto out_err; + } + *ub_slab_ptr(cachep, objp) = ub; + return 0; + +out_err: + put_beancounter(ub); + return -ENOMEM; +} + +/* called with IRQ disabled */ +void ub_slab_uncharge(struct kmem_cache *cachep, void *objp) +{ + unsigned int size; + struct user_beancounter **ub_ref; + + ub_ref = ub_slab_ptr(cachep, objp); + if (*ub_ref == NULL) + return; + + dec_slab_charged(*ub_ref, cachep); + size = CHARGE_SIZE(kmem_cache_objuse(cachep)); + ub_kmemsize_uncharge(*ub_ref, size); + put_beancounter(*ub_ref); + *ub_ref = NULL; +} + +/* + * Pages accounting + */ + +int ub_page_charge(struct page *page, int order, gfp_t mask) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = NULL; + if (!(mask & __GFP_UBC)) + goto out; + + ub = get_beancounter(get_exec_ub()); + if (ub == NULL) + goto out; + + local_irq_save(flags); + if (ub_kmemsize_charge(ub, CHARGE_ORDER(order), + (mask & __GFP_SOFT_UBC ? 
UB_SOFT : UB_HARD))) + goto err; + + inc_pages_charged(ub, order); + local_irq_restore(flags); +out: + BUG_ON(page_ub(page) != NULL); + page_ub(page) = ub; + return 0; + +err: + local_irq_restore(flags); + BUG_ON(page_ub(page) != NULL); + put_beancounter(ub); + return -ENOMEM; +} + +void ub_page_uncharge(struct page *page, int order) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = page_ub(page); + if (ub == NULL) + return; + + BUG_ON(ub->ub_magic != UB_MAGIC); + dec_pages_charged(ub, order); + local_irq_save(flags); + ub_kmemsize_uncharge(ub, CHARGE_ORDER(order)); + local_irq_restore(flags); + put_beancounter(ub); + page_ub(page) = NULL; +} + +/* + * takes init_mm.page_table_lock + * some outer lock to protect pages from vmalloced area must be held + */ +struct user_beancounter *vmalloc_ub(void *obj) +{ + struct page *pg; + + pg = vmalloc_to_page(obj); + if (pg == NULL) + return NULL; + + return page_ub(pg); +} + +EXPORT_SYMBOL(vmalloc_ub); + +struct user_beancounter *mem_ub(void *obj) +{ + struct user_beancounter *ub; + + if ((unsigned long)obj >= VMALLOC_START && + (unsigned long)obj < VMALLOC_END) + ub = vmalloc_ub(obj); + else + ub = slab_ub(obj); + + return ub; +} + +EXPORT_SYMBOL(mem_ub); diff -urNp linux-2.6.32.48/kernel/bc/Makefile linux-2.6.32.48-openvz/kernel/bc/Makefile --- linux-2.6.32.48/kernel/bc/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/Makefile 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,15 @@ +# +# User resources part (UBC) +# +# Copyright (C) 2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. 
+ +obj-y := sys.o beancounter.o dcache.o kmem.o misc.o \ + vm_pages.o statd.o oom_kill.o + +obj-$(CONFIG_NET) += net.o +obj-$(CONFIG_BC_RSS_ACCOUNTING) += rss_pages.o +obj-$(CONFIG_BC_PROC) += proc.o +obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o diff -urNp linux-2.6.32.48/kernel/bc/misc.c linux-2.6.32.48-openvz/kernel/bc/misc.c --- linux-2.6.32.48/kernel/bc/misc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/misc.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,460 @@ +/* + * kernel/bc/misc.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define UB_FILE_MINQUANT 3 +#define UB_FILE_MAXQUANT 10 +#define UB_FILE_INIQUANT 4 + +static unsigned long ub_file_precharge(struct task_beancounter *task_bc, + struct user_beancounter *ub, unsigned long *kmemsize); + +static inline unsigned long ub_file_kmemsize(unsigned long nr) +{ + return CHARGE_SIZE(kmem_cache_objuse(filp_cachep)) * nr; +} + +/* + * Task staff + */ + +static void init_task_sub(struct task_struct *parent, + struct task_struct *tsk, + struct task_beancounter *old_bc) +{ + struct task_beancounter *new_bc; + struct user_beancounter *sub; + + new_bc = &tsk->task_bc; + sub = old_bc->fork_sub; + new_bc->fork_sub = get_beancounter(sub); + new_bc->task_fnode = NULL; + new_bc->task_freserv = old_bc->task_freserv; + old_bc->task_freserv = NULL; + memset(&new_bc->task_data, 0, sizeof(new_bc->task_data)); + new_bc->pgfault_handle = 0; + new_bc->pgfault_allot = 0; +} + +void ub_init_task_bc(struct task_beancounter *tbc) +{ + tbc->file_precharged = 0; + tbc->file_quant = UB_FILE_INIQUANT; + tbc->file_count = 0; + + tbc->kmem_precharged = 0; + tbc->dentry_alloc = 0; +} + +int ub_task_charge(struct task_struct *parent, struct task_struct *task) +{ + struct task_beancounter *old_bc; + struct 
task_beancounter *new_bc; + struct user_beancounter *ub, *pub; + unsigned long file_nr, kmemsize; + unsigned long flags; + + old_bc = &parent->task_bc; + ub = old_bc->fork_sub; + new_bc = &task->task_bc; + new_bc->task_ub = get_beancounter(ub); + new_bc->exec_ub = get_beancounter(ub); + + pub = top_beancounter(ub); + spin_lock_irqsave(&pub->ub_lock, flags); + if (unlikely(__charge_beancounter_locked(pub, UB_NUMPROC, + 1, UB_HARD) < 0)) + goto out_numproc; + + ub_init_task_bc(new_bc); + file_nr = ub_file_precharge(new_bc, pub, &kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + + charge_beancounter_notop(ub, UB_NUMPROC, 1); + if (likely(file_nr)) { + charge_beancounter_notop(ub, UB_NUMFILE, file_nr); + charge_beancounter_notop(ub, UB_KMEMSIZE, kmemsize); + } + + init_task_sub(parent, task, old_bc); + return 0; + +out_numproc: + spin_unlock_irqrestore(&pub->ub_lock, flags); + __put_beancounter_batch(ub, 2); + return -ENOMEM; +} + +extern atomic_t dbgpre; + +void ub_task_uncharge(struct task_struct *task) +{ + struct task_beancounter *task_bc; + struct user_beancounter *pub; + unsigned long file_nr, file_kmemsize; + unsigned long flags; + + task_bc = &task->task_bc; + pub = top_beancounter(task_bc->task_ub); + spin_lock_irqsave(&pub->ub_lock, flags); + __uncharge_beancounter_locked(pub, UB_NUMPROC, 1); + file_nr = task_bc->file_precharged; + if (likely(file_nr)) + __uncharge_beancounter_locked(pub, + UB_NUMFILE, file_nr); + + /* see comment in ub_file_charge */ + task_bc->file_precharged = 0; + file_kmemsize = ub_file_kmemsize(file_nr); + if (likely(file_kmemsize)) + __uncharge_beancounter_locked(pub, + UB_KMEMSIZE, file_kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + + uncharge_beancounter_notop(task_bc->task_ub, UB_NUMPROC, 1); + if (likely(file_nr)) { + uncharge_beancounter_notop(task_bc->task_ub, + UB_NUMFILE, file_nr); + __put_beancounter_batch(task_bc->task_ub, file_nr); + } + if (likely(file_kmemsize)) + 
uncharge_beancounter_notop(task_bc->task_ub, + UB_KMEMSIZE, file_kmemsize); +} + +void ub_task_put(struct task_struct *task) +{ + struct task_beancounter *task_bc; + struct user_beancounter *pub; + unsigned long kmemsize, flags; + + task_bc = &task->task_bc; + + pub = top_beancounter(task_bc->task_ub); + spin_lock_irqsave(&pub->ub_lock, flags); + kmemsize = task_bc->kmem_precharged; + task_bc->kmem_precharged = 0; + if (likely(kmemsize)) + __uncharge_beancounter_locked(pub, UB_KMEMSIZE, kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + if (likely(kmemsize)) + uncharge_beancounter_notop(task_bc->task_ub, UB_KMEMSIZE, kmemsize); + + put_beancounter(task_bc->exec_ub); + put_beancounter(task_bc->task_ub); + put_beancounter(task_bc->fork_sub); + /* can't be freed elsewhere, failures possible in the middle of fork */ + if (task_bc->task_freserv != NULL) + kfree(task_bc->task_freserv); + + task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc; + task_bc->task_ub = (struct user_beancounter *)0xdead100c; + BUG_ON(task_bc->kmem_precharged != 0); +} + +/* + * Files and file locks. + */ +/* + * For NUMFILE, we do not take a lock and call charge function + * for every file. We try to charge in batches, keeping local reserve on + * task. For experimental purposes, batch size is adaptive and depends + * on numfile barrier, number of processes, and the history of successes and + * failures of batch charges. + * + * Per-task fields have the following meaning + * file_precharged number of files charged to beancounter in advance, + * file_quant logarithm of batch size + * file_count counter of charge successes, to reduce batch size + * fluctuations. 
+ */
+static unsigned long ub_file_precharge(struct task_beancounter *task_bc,
+		struct user_beancounter *ub, unsigned long *kmemsize)
+{
+	unsigned long n, kmem;
+
+	n = 1UL << task_bc->file_quant;
+	if (ub->ub_parms[UB_NUMPROC].held >
+			(ub->ub_parms[UB_NUMFILE].barrier >>
+			 task_bc->file_quant))
+		goto nopre;
+	if (unlikely(__charge_beancounter_locked(ub, UB_NUMFILE, n, UB_HARD)))
+		goto nopre;
+	kmem = ub_file_kmemsize(n);
+	if (unlikely(__charge_beancounter_locked(ub, UB_KMEMSIZE,
+					kmem, UB_HARD)))
+		goto nopre_kmem;
+
+	task_bc->file_precharged += n;
+	get_beancounter_batch(task_bc->task_ub, n);
+	task_bc->file_count++;
+	if (task_bc->file_quant < UB_FILE_MAXQUANT &&
+	    task_bc->file_count >= task_bc->file_quant) {
+		task_bc->file_quant++;
+		task_bc->file_count = 0;
+	}
+	*kmemsize = kmem;
+	return n;
+
+nopre_kmem:
+	__uncharge_beancounter_locked(ub, UB_NUMFILE, n);
+nopre:
+	if (task_bc->file_quant > UB_FILE_MINQUANT)
+		task_bc->file_quant--;
+	task_bc->file_count = 0;
+	return 0;
+}
+
+/*
+ * Charge one open file @f to the executing beancounter.
+ *
+ * Fast path: the exec beancounter is the current task's task_ub and the
+ * task still holds precharged files, so one of them is consumed.
+ * Otherwise a new batch is precharged under the top beancounter's
+ * ub_lock.  If that fails, or the exec beancounter is not the task's
+ * own, a single file and its kmem are charged directly and @f takes its
+ * own beancounter reference.
+ *
+ * Returns 0 on success, otherwise the error returned by
+ * __charge_beancounter_locked().
+ */
+int ub_file_charge(struct file *f)
+{
+	struct user_beancounter *ub, *pub;
+	struct task_beancounter *task_bc;
+	unsigned long file_nr, kmem;
+	unsigned long flags;
+	int err;
+
+	task_bc = &current->task_bc;
+	ub = get_exec_ub();
+	if (unlikely(ub != task_bc->task_ub))
+		goto just_charge;
+
+	if (likely(task_bc->file_precharged > 0)) {
+		/*
+		 * files are put via RCU in 2.6.16 so during
+		 * this decrement an IRQ can happen and called
+		 * ub_files_uncharge() will mess file_precharged
+		 *
+		 * ub_task_uncharge() is called via RCU also so no
+		 * protection is needed there
+		 *
+		 * Xemul
+		 */
+
+		local_irq_save(flags);
+		task_bc->file_precharged--;
+		local_irq_restore(flags);
+
+		f->f_ub = ub;
+		return 0;
+	}
+
+	pub = top_beancounter(ub);
+	spin_lock_irqsave(&pub->ub_lock, flags);
+	file_nr = ub_file_precharge(task_bc, pub, &kmem);
+	if (unlikely(!file_nr))
+		goto last_try;
+	/* drop the lock only; IRQs stay off until the reserve is updated */
+	spin_unlock(&pub->ub_lock);
+	task_bc->file_precharged--;
+	local_irq_restore(flags);
+
+
charge_beancounter_notop(ub, UB_NUMFILE, file_nr);
+	charge_beancounter_notop(ub, UB_KMEMSIZE, kmem);
+	f->f_ub = ub;
+	return 0;
+
+just_charge:
+	pub = top_beancounter(ub);
+	spin_lock_irqsave(&pub->ub_lock, flags);
+last_try:
+	kmem = ub_file_kmemsize(1);
+	err = __charge_beancounter_locked(pub, UB_NUMFILE, 1, UB_HARD);
+	if (likely(!err)) {
+		err = __charge_beancounter_locked(pub, UB_KMEMSIZE,
+				kmem, UB_HARD);
+		if (unlikely(err))
+			__uncharge_beancounter_locked(pub, UB_NUMFILE, 1);
+	}
+	spin_unlock_irqrestore(&pub->ub_lock, flags);
+	if (likely(!err)) {
+		charge_beancounter_notop(ub, UB_NUMFILE, 1);
+		charge_beancounter_notop(ub, UB_KMEMSIZE, kmem);
+		f->f_ub = get_beancounter(ub);
+	}
+	return err;
+}
+
+/* True while the task's file reserve is below one full batch. */
+static inline int task_precharge_farnr(struct task_beancounter *task_bc)
+{
+	return (task_bc->file_precharged < (1UL << task_bc->file_quant));
+}
+
+/*
+ * Release the charge taken for file @f.
+ *
+ * If f->f_ub is the current task's task_ub, the file goes back into the
+ * task's precharge reserve.  Nothing is returned to the beancounter
+ * while the reserve is below one batch (1 << file_quant) and
+ * ub_barrier_farsz() holds for UB_KMEMSIZE on the top beancounter;
+ * otherwise everything above half a batch is uncharged and the matching
+ * beancounter references are dropped.
+ *
+ * For any other beancounter a single file and its kmem are uncharged
+ * and one reference is dropped.
+ */
+void ub_file_uncharge(struct file *f)
+{
+	struct user_beancounter *ub, *pub;
+	struct task_beancounter *task_bc;
+	int nr;
+
+	ub = f->f_ub;
+	task_bc = &current->task_bc;
+	if (likely(ub == task_bc->task_ub)) {
+		task_bc->file_precharged++;
+		pub = top_beancounter(ub);
+		if (task_precharge_farnr(task_bc) &&
+				ub_barrier_farsz(pub, UB_KMEMSIZE))
+			return;
+		nr = task_bc->file_precharged
+			- (1UL << (task_bc->file_quant - 1));
+		if (nr > 0) {
+			task_bc->file_precharged -= nr;
+			__put_beancounter_batch(ub, nr);
+			uncharge_beancounter(ub, UB_NUMFILE, nr);
+			uncharge_beancounter(ub, UB_KMEMSIZE,
+					ub_file_kmemsize(nr));
+		}
+	} else {
+		uncharge_beancounter(ub, UB_NUMFILE, 1);
+		uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(1));
+		put_beancounter(ub);
+	}
+}
+
+int ub_flock_charge(struct file_lock *fl, int hard)
+{
+	struct user_beancounter *ub;
+	int err;
+
+	/* No need to get_beancounter here since it's already got in slab */
+	ub = slab_ub(fl);
+	if (ub == NULL)
+		return 0;
+
+	err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ?
UB_HARD : UB_SOFT); + if (!err) + fl->fl_charged = 1; + return err; +} + +void ub_flock_uncharge(struct file_lock *fl) +{ + struct user_beancounter *ub; + + /* Ub will be put in slab */ + ub = slab_ub(fl); + if (ub == NULL || !fl->fl_charged) + return; + + uncharge_beancounter(ub, UB_NUMFLOCK, 1); + fl->fl_charged = 0; +} + +/* + * Signal handling + */ + +static int do_ub_siginfo_charge(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD)) + goto out_kmem; + + if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD)) + goto out_num; + + spin_unlock_irqrestore(&ub->ub_lock, flags); + return 0; + +out_num: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); +out_kmem: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return -ENOMEM; +} + +static void do_ub_siginfo_uncharge(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); + __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub) +{ + unsigned long size; + struct user_beancounter *p, *q; + + size = CHARGE_SIZE(kmem_obj_objuse(sq)); + for (p = ub; p != NULL; p = p->parent) { + if (do_ub_siginfo_charge(p, size)) + goto unroll; + } + + sq->sig_ub = get_beancounter(ub); + return 0; + +unroll: + for (q = ub; q != p; q = q->parent) + do_ub_siginfo_uncharge(q, size); + return -ENOMEM; +} +EXPORT_SYMBOL(ub_siginfo_charge); + +void ub_siginfo_uncharge(struct sigqueue *sq) +{ + unsigned long size; + struct user_beancounter *ub, *p; + + p = ub = sq->sig_ub; + sq->sig_ub = NULL; + size = CHARGE_SIZE(kmem_obj_objuse(sq)); + for (; ub != NULL; ub = ub->parent) + do_ub_siginfo_uncharge(ub, size); + put_beancounter(p); +} + +/* + * PTYs + */ + +int 
ub_pty_charge(struct tty_struct *tty) +{ + struct user_beancounter *ub; + int retval; + + ub = slab_ub(tty); + retval = 0; + if (ub && tty->driver->subtype == PTY_TYPE_MASTER && + !test_bit(TTY_CHARGED, &tty->flags)) { + retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD); + if (!retval) + set_bit(TTY_CHARGED, &tty->flags); + } + return retval; +} + +void ub_pty_uncharge(struct tty_struct *tty) +{ + struct user_beancounter *ub; + + ub = slab_ub(tty); + if (ub && tty->driver->subtype == PTY_TYPE_MASTER && + test_bit(TTY_CHARGED, &tty->flags)) { + uncharge_beancounter(ub, UB_NUMPTY, 1); + clear_bit(TTY_CHARGED, &tty->flags); + } +} diff -urNp linux-2.6.32.48/kernel/bc/net.c linux-2.6.32.48-openvz/kernel/bc/net.c --- linux-2.6.32.48/kernel/bc/net.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/net.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,1167 @@ +/* + * linux/kernel/bc/net.c + * + * Copyright (C) 1998-2004 Andrey V. Savochkin + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * TODO: + * - sizeof(struct inode) charge + * = tcp_mem_schedule() feedback based on ub limits + * + measures so that one socket won't exhaust all send buffers, + * see bug in bugzilla + * = sk->socket check for NULL in snd_wakeups + * (tcp_write_space checks for NULL itself) + * + in tcp_close(), orphaned socket abortion should be based on ubc + * resources (same in tcp_out_of_resources) + * Beancounter should also have separate orphaned socket counter... 
+ * + for rcv, in-order segment should be accepted + * if only barrier is exceeded + * = tcp_rmem_schedule() feedback based on ub limits + * - repair forward_alloc mechanism for receive buffers + * Its idea is that some buffer space is pre-charged so that receive fast + * path doesn't need to take spinlocks and do other heavy stuff + * + tcp_prune_queue actions based on ub limits + * + window adjustments depending on available buffers for receive + * - window adjustments depending on available buffers for send + * + race around usewreserv + * + avoid allocating new page for each tiny-gram, see letter from ANK + * + rename ub_sock_lock + * + sk->sleep wait queue probably can be used for all wakeups, and + * sk->ub_wait is unnecessary + * + for UNIX sockets, the current algorithm will lead to + * UB_UNIX_MINBUF-sized messages only for non-blocking case + * - charge for af_packet sockets + * + all datagram sockets should be charged to NUMUNIXSOCK + * - we do not charge for skb copies and clones staying in device queues + * + live-lock if number of sockets is big and buffer limits are small + * [diff-ubc-dbllim3] + * - check that multiple readers/writers on the same socket won't cause fatal + * consequences + * - check allocation/charge orders + * + There is a potential problem with callback_lock. In *snd_wakeup we take + * beancounter first, in sock_def_error_report - callback_lock first, + * then beancounter. This is not a problem if callback_lock is taken + * readonly, but anyway... 
+ * - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator + * General kernel problems: + * - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC + * notification won't get signals + * - datagram_poll looks racy + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* by some reason it is not used currently */ +#define UB_SOCK_MAINTAIN_WMEMPRESSURE 0 + + +/* Skb truesize definition. Bad place. Den */ + +static inline int skb_chargesize_head(struct sk_buff *skb) +{ + return skb_charge_size(skb_end_pointer(skb) - skb->head + + sizeof(struct skb_shared_info)); +} + +int skb_charge_fullsize(struct sk_buff *skb) +{ + int chargesize; + struct sk_buff *skbfrag; + + chargesize = skb_chargesize_head(skb) + + PAGE_SIZE * skb_shinfo(skb)->nr_frags; + if (likely(skb_shinfo(skb)->frag_list == NULL)) + return chargesize; + for (skbfrag = skb_shinfo(skb)->frag_list; + skbfrag != NULL; + skbfrag = skbfrag->next) { + chargesize += skb_charge_fullsize(skbfrag); + } + return chargesize; +} +EXPORT_SYMBOL(skb_charge_fullsize); + +static int ub_sock_makewreserv_locked(struct sock *sk, + int bufid, unsigned long size); + +int ub_too_many_orphans(struct sock *sk, int shift) +{ + struct percpu_counter *cnt; + + cnt = sk->sk_prot->orphan_count; + if (sock_has_ubc(sk)) { + struct user_beancounter *ub; + unsigned int orphans, limit; + + ub = top_beancounter(sock_bc(sk)->ub); + limit = ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2; + cnt = &ub->ub_orphan_count; + + orphans = percpu_counter_read_positive(cnt); + if ((orphans << shift) >= limit) + return 1; + + orphans = percpu_counter_sum_positive(cnt); + if ((orphans << shift) >= limit) + return 1; + } + + return tcp_too_many_orphans(sk, cnt, shift); +} + +/* + * Queueing + */ + +static void ub_sock_snd_wakeup(struct user_beancounter *ub) +{ + struct list_head *p; + struct sock *sk; + struct sock_beancounter *skbc; + 
struct socket *sock; + unsigned long added; + + while (!list_empty(&ub->ub_other_sk_list)) { + p = ub->ub_other_sk_list.next; + skbc = list_entry(p, struct sock_beancounter, ub_sock_list); + sk = skbc_sock(skbc); + + added = 0; + sock = sk->sk_socket; + if (sock == NULL) { + /* sk being destroyed */ + list_del_init(&skbc->ub_sock_list); + continue; + } + + ub_debug(UBD_NET_SLEEP, + "Checking queue, waiting %lu, reserv %lu\n", + skbc->ub_waitspc, skbc->poll_reserv); + added = -skbc->poll_reserv; + if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, + skbc->ub_waitspc)) + break; + added += skbc->poll_reserv; + + list_del_init(&skbc->ub_sock_list); + + /* + * See comments in ub_tcp_snd_wakeup. + * Locking note: both unix_write_space and + * sock_def_write_space take callback_lock themselves. + * We take it here just to be on the safe side and to + * act the same way as ub_tcp_snd_wakeup does. + */ + sock_hold(sk); + read_lock(&sk->sk_callback_lock); + spin_unlock(&ub->ub_lock); + + sk->sk_write_space(sk); + read_unlock(&sk->sk_callback_lock); + + if (skbc->ub != ub && added) + charge_beancounter_notop(skbc->ub, + UB_OTHERSOCKBUF, added); + sock_put(sk); + + spin_lock(&ub->ub_lock); + } +} + +static void ub_tcp_snd_wakeup(struct user_beancounter *ub) +{ + struct list_head *p; + struct sock *sk; + struct sock_beancounter *skbc; + struct socket *sock; + unsigned long added; + + while (!list_empty(&ub->ub_tcp_sk_list)) { + p = ub->ub_tcp_sk_list.next; + skbc = list_entry(p, struct sock_beancounter, ub_sock_list); + sk = skbc_sock(skbc); + + added = 0; + sock = sk->sk_socket; + if (sock == NULL) { + /* sk being destroyed */ + list_del_init(&skbc->ub_sock_list); + continue; + } + + ub_debug(UBD_NET_SLEEP, + "Checking queue, waiting %lu, reserv %lu\n", + skbc->ub_waitspc, skbc->poll_reserv); + added = -skbc->poll_reserv; + if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, + skbc->ub_waitspc)) + break; + added += skbc->poll_reserv; + + list_del_init(&skbc->ub_sock_list); + + 
/* + * Send async notifications and wake up. + * Locking note: we get callback_lock here because + * tcp_write_space is over-optimistic about calling context + * (socket lock is presumed). So we get the lock here although + * it belongs to the callback. + */ + sock_hold(sk); + read_lock(&sk->sk_callback_lock); + spin_unlock(&ub->ub_lock); + + sk->sk_write_space(sk); + read_unlock(&sk->sk_callback_lock); + + if (skbc->ub != ub && added) + charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added); + sock_put(sk); + + spin_lock(&ub->ub_lock); + } +} + +int ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size) +{ + unsigned long flags; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long added_reserv; + + if (!sock_has_ubc(sk)) + return 0; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size); + added_reserv = -skbc->poll_reserv; + if (!ub_sock_makewreserv_locked(sk, res, size)) { + /* + * It looks a bit hackish, but it is compatible with both + * wait_for_xx_ubspace and poll. + * This __set_current_state is equivalent to a wakeup event + * right after spin_unlock_irqrestore. 
+ */ + __set_current_state(TASK_RUNNING); + added_reserv += skbc->poll_reserv; + spin_unlock_irqrestore(&ub->ub_lock, flags); + if (added_reserv) + charge_beancounter_notop(skbc->ub, res, added_reserv); + return 0; + } + + ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n"); + skbc->ub_waitspc = size; + if (!list_empty(&skbc->ub_sock_list)) { + ub_debug(UBD_NET_SOCKET, + "re-adding socket to beancounter %p.\n", ub); + goto out; + } + + switch (res) { + case UB_TCPSNDBUF: + list_add_tail(&skbc->ub_sock_list, + &ub->ub_tcp_sk_list); + break; + case UB_OTHERSOCKBUF: + list_add_tail(&skbc->ub_sock_list, + &ub->ub_other_sk_list); + break; + default: + BUG(); + } +out: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return -ENOMEM; +} + +EXPORT_SYMBOL(ub_sock_snd_queue_add); + +long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + set_current_state(TASK_INTERRUPTIBLE); + if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size)) + break; + + if (sk->sk_shutdown & SEND_SHUTDOWN) + break; + if (sk->sk_err) + break; + ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size); + timeo = schedule_timeout(timeo); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return timeo; +} + +void ub_sock_sndqueuedel(struct sock *sk) +{ + struct user_beancounter *ub; + struct sock_beancounter *skbc; + unsigned long flags; + + if (!sock_has_ubc(sk)) + return; + skbc = sock_bc(sk); + + /* race with write_space callback of other socket */ + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + list_del_init(&skbc->ub_sock_list); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +/* + * Helpers + */ + +static inline void __ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, + unsigned long size, int resource) +{ + WARN_ON_ONCE(skb_bc(skb)->ub != NULL); + + skb_bc(skb)->ub = sock_bc(sk)->ub; 
+ skb_bc(skb)->charged = size; + skb_bc(skb)->resource = resource; +} + +void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, + unsigned long size, int resource) +{ + if (!sock_has_ubc(sk)) + return; + + if (sock_bc(sk)->ub == NULL) + BUG(); + + __ub_skb_set_charge(skb, sk, size, resource); + + /* Ugly. Ugly. Skb in sk writequeue can live without ref to sk */ + if (skb->sk == NULL) + skb->sk = sk; +} + +EXPORT_SYMBOL(ub_skb_set_charge); + +static inline void ub_skb_set_uncharge(struct sk_buff *skb) +{ + skb_bc(skb)->ub = NULL; + skb_bc(skb)->charged = 0; + skb_bc(skb)->resource = 0; +} + +static void ub_update_rmem_thres(struct sock_beancounter *skub) +{ + struct user_beancounter *ub; + + if (skub && skub->ub) { + ub = top_beancounter(skub->ub); + ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier / + (ub->ub_parms[UB_NUMTCPSOCK].held + 1); + } +} + +static inline void ub_sock_wcharge_dec(struct sock *sk, + unsigned long chargesize) +{ + /* The check sk->sk_family != PF_NETLINK is made as the skb is + * queued to the kernel end of socket while changed to the user one. 
+ * Den */ + if (unlikely(sock_bc(sk)->ub_wcharged) && sk->sk_family != PF_NETLINK) { + if (sock_bc(sk)->ub_wcharged > chargesize) + sock_bc(sk)->ub_wcharged -= chargesize; + else + sock_bc(sk)->ub_wcharged = 0; + } +} + +/* + * Charge socket number + */ + +static inline void sk_alloc_beancounter(struct sock *sk) +{ + struct sock_beancounter *skbc; + + skbc = sock_bc(sk); + memset(skbc, 0, sizeof(struct sock_beancounter)); +} + +static inline void sk_free_beancounter(struct sock *sk) +{ +} + +static int __sock_charge(struct sock *sk, int res) +{ + struct sock_beancounter *skbc; + struct user_beancounter *cub, *ub; + unsigned long added_reserv, added_forw; + unsigned long flags; + + cub = get_exec_ub(); + if (unlikely(cub == NULL)) + return 0; + + sk_alloc_beancounter(sk); + skbc = sock_bc(sk); + INIT_LIST_HEAD(&skbc->ub_sock_list); + + ub = top_beancounter(cub); + spin_lock_irqsave(&ub->ub_lock, flags); + if (unlikely(__charge_beancounter_locked(ub, res, 1, UB_HARD) < 0)) + goto out_limit; + + added_reserv = 0; + added_forw = 0; + if (res == UB_NUMTCPSOCK) { + added_reserv = skb_charge_size(MAX_TCP_HEADER + + 1500 - sizeof(struct iphdr) - + sizeof(struct tcphdr)); + added_reserv *= 4; + ub->ub_parms[UB_TCPSNDBUF].held += added_reserv; + if (!ub_barrier_farsz(ub, UB_TCPSNDBUF)) { + ub->ub_parms[UB_TCPSNDBUF].held -= added_reserv; + added_reserv = 0; + } + skbc->poll_reserv = added_reserv; + + added_forw = SK_MEM_QUANTUM * 4; + ub->ub_parms[UB_TCPRCVBUF].held += added_forw; + if (!ub_barrier_farsz(ub, UB_TCPRCVBUF)) { + ub->ub_parms[UB_TCPRCVBUF].held -= added_forw; + added_forw = 0; + } + skbc->forw_space = added_forw; + } + spin_unlock_irqrestore(&ub->ub_lock, flags); + + charge_beancounter_notop(cub, res, 1); + if (added_reserv) + charge_beancounter_notop(cub, UB_TCPSNDBUF, added_reserv); + if (added_forw) + charge_beancounter_notop(cub, UB_TCPRCVBUF, added_forw); + + skbc->ub = get_beancounter(cub); + return 0; + +out_limit: + spin_unlock_irqrestore(&ub->ub_lock, 
flags); + sk_free_beancounter(sk); + return -ENOMEM; +} + +int ub_tcp_sock_charge(struct sock *sk) +{ + int ret; + + ret = __sock_charge(sk, UB_NUMTCPSOCK); + ub_update_rmem_thres(sock_bc(sk)); + + return ret; +} + +int ub_other_sock_charge(struct sock *sk) +{ + return __sock_charge(sk, UB_NUMOTHERSOCK); +} + +EXPORT_SYMBOL(ub_other_sock_charge); + +int ub_sock_charge(struct sock *sk, int family, int type) +{ + return (IS_TCP_SOCK(family, type) ? + ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk)); +} + +EXPORT_SYMBOL(ub_sock_charge); + +/* + * Uncharge socket number + */ + +void ub_sock_uncharge(struct sock *sk) +{ + int is_tcp_sock; + unsigned long flags; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long reserv, forw; + + if (unlikely(!sock_has_ubc(sk))) + return; + + is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type); + skbc = sock_bc(sk); + ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk); + + ub = top_beancounter(skbc->ub); + + spin_lock_irqsave(&ub->ub_lock, flags); + if (!list_empty(&skbc->ub_sock_list)) { + ub_debug(UBD_NET_SOCKET, + "ub_sock_uncharge: removing from ub(%p) queue.\n", + skbc); + list_del_init(&skbc->ub_sock_list); + } + + reserv = skbc->poll_reserv; + forw = skbc->forw_space; + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), + reserv); + if (forw) + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF), + forw); + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); + + ub_sock_wcharge_dec(sk, reserv); + if (unlikely(skbc->ub_wcharged)) + printk(KERN_WARNING + "ub_sock_uncharge: wch=%lu for ub %p (%d).\n", + skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid); + skbc->poll_reserv = 0; + skbc->forw_space = 0; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? 
UB_TCPSNDBUF : UB_OTHERSOCKBUF), + reserv); + if (forw) + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF), + forw); + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); + + put_beancounter(skbc->ub); + sk_free_beancounter(sk); +} + +/* + * Special case for netlink_dump - (un)charges precalculated size + */ + +int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk) +{ + int ret; + unsigned long chargesize; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + chargesize = skb_charge_fullsize(skb); + ret = charge_beancounter(sock_bc(sk)->ub, + UB_OTHERSOCKBUF, chargesize, UB_HARD); + if (ret < 0) + return ret; + ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); + return ret; +} + +/* + * Poll reserve accounting + * + * This is the core of socket buffer management (along with queueing/wakeup + * functions. The rest of buffer accounting either call these functions, or + * repeat parts of their logic for some simpler cases. 
+ */ + +static int ub_sock_makewreserv_locked(struct sock *sk, + int bufid, unsigned long size) +{ + unsigned long wcharge_added; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + + skbc = sock_bc(sk); + if (skbc->poll_reserv >= size) /* no work to be done */ + goto out; + + ub = top_beancounter(skbc->ub); + ub->ub_parms[bufid].held += size - skbc->poll_reserv; + + wcharge_added = 0; + /* + * Logic: + * 1) when used memory hits barrier, we set wmem_pressure; + * wmem_pressure is reset under barrier/2; + * between barrier/2 and barrier we limit per-socket buffer growth; + * 2) each socket is guaranteed to get (limit-barrier)/maxsockets + * calculated on the base of memory eaten after the barrier is hit + */ + skbc = sock_bc(sk); +#if UB_SOCK_MAINTAIN_WMEMPRESSURE + if (!ub_hfbarrier_hit(ub, bufid)) { + if (ub->ub_wmem_pressure) + ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, + ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_wmem_pressure = 0; + } +#endif + if (ub_barrier_hit(ub, bufid)) { +#if UB_SOCK_MAINTAIN_WMEMPRESSURE + if (!ub->ub_wmem_pressure) + ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, + ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_wmem_pressure = 1; +#endif + if (sk->sk_family == PF_NETLINK) + goto unroll; + wcharge_added = size - skbc->poll_reserv; + skbc->ub_wcharged += wcharge_added; + if (skbc->ub_wcharged * ub->ub_parms[bid2sid(bufid)].limit + + ub->ub_parms[bufid].barrier > + ub->ub_parms[bufid].limit) + goto unroll_wch; + } + if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit) + goto unroll; + + ub_adjust_maxheld(ub, bufid); + skbc->poll_reserv = size; +out: + return 0; + +unroll_wch: + skbc->ub_wcharged -= wcharge_added; +unroll: + ub_debug(UBD_NET_SEND, + "makewres: deny " + "sk %p sz %lu pr %lu hd %lu wc %lu 
sb %d.\n", + sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_parms[bufid].failcnt++; + ub->ub_parms[bufid].held -= size - skbc->poll_reserv; + + if (sk->sk_socket != NULL) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + return -ENOMEM; +} + +int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + unsigned long added_reserv; + int err; + + skbc = sock_bc(sk); + + /* + * This function provides that there is sufficient reserve upon return + * only if sk has only one user. We can check poll_reserv without + * serialization and avoid locking if the reserve already exists. + */ + if (unlikely(!sock_has_ubc(sk)) || likely(skbc->poll_reserv >= size)) + return 0; + + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + added_reserv = -skbc->poll_reserv; + err = ub_sock_makewreserv_locked(sk, bufid, size); + added_reserv += skbc->poll_reserv; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (added_reserv) + charge_beancounter_notop(skbc->ub, bufid, added_reserv); + + return err; +} + +EXPORT_SYMBOL(ub_sock_make_wreserv); + +int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size) +{ + struct sock_beancounter *skbc; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + /* optimize for the case if socket has sufficient reserve */ + ub_sock_make_wreserv(sk, bufid, size); + skbc = sock_bc(sk); + if (likely(skbc->poll_reserv >= size)) { + skbc->poll_reserv -= size; + return 0; + } + return -ENOMEM; +} + +EXPORT_SYMBOL(ub_sock_get_wreserv); + +static void ub_sock_do_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long extra; + unsigned long flags; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + 
+ extra = 0; + spin_lock_irqsave(&ub->ub_lock, flags); + skbc->poll_reserv += size; + if (skbc->poll_reserv > ressize) { + extra = skbc->poll_reserv - ressize; + ub_sock_wcharge_dec(sk, extra); + skbc->poll_reserv = ressize; + + __uncharge_beancounter_locked(ub, bufid, extra); + if (bufid == UB_TCPSNDBUF) + ub_tcp_snd_wakeup(ub); + else + ub_sock_snd_wakeup(ub); + } + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (extra) + uncharge_beancounter_notop(skbc->ub, bufid, extra); +} + +void ub_sock_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + + if (unlikely(!sock_has_ubc(sk))) + return; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + /* check if the reserve can be kept */ + if (ub_barrier_farsz(ub, bufid)) { + skbc->poll_reserv += size; + return; + } + ub_sock_do_ret_wreserv(sk, bufid, size, ressize); +} + +/* + * UB_DGRAMRCVBUF + */ + +static int ub_dgramrcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + unsigned long chargesize; + + chargesize = skb_charge_fullsize(skb); + if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF, + chargesize, UB_HARD)) + return -ENOMEM; + + ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); + return 0; +} + +int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + if (unlikely(!sock_has_ubc(sk))) + return 0; + + if (IS_TCP_SOCK(sk->sk_family, sk->sk_type)) + return ub_tcprcvbuf_charge(sk, skb); + else + return ub_dgramrcvbuf_charge(sk, skb); +} + +EXPORT_SYMBOL(ub_sockrcvbuf_charge); + +static void ub_sockrcvbuf_uncharge(struct sk_buff *skb) +{ + uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF, + skb_bc(skb)->charged); + ub_skb_set_uncharge(skb); +} + +/* + * UB_TCPRCVBUF + */ + +int ub_sock_tcp_chargerecv(struct sock *sk, struct sk_buff *skb, + enum ub_severity strict) +{ + int retval; + unsigned long flags; + struct user_beancounter *ub; + struct sock_beancounter *skbc; + unsigned 
long chargesize; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + skbc = sock_bc(sk); + + chargesize = skb_charge_fullsize(skb); + if (likely(skbc->forw_space >= chargesize)) { + skbc->forw_space -= chargesize; + __ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); + return 0; + } + + /* + * Memory pressure reactions: + * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND) + * 2) set UB_RMEM_SHRINK and tcp_clamp_window() + * tcp_collapse_queues() if rmem_alloc > rcvbuf + * 3) drop OFO, tcp_purge_ofo() + * 4) drop all. + * Currently, we do #2 and #3 at once (which means that current + * collapsing of OFO queue in tcp_collapse_queues() is a waste of time, + * for example...) + * On memory pressure we jump from #0 to #3, and when the pressure + * subsides, to #1. + */ + retval = 0; + ub = top_beancounter(sock_bc(sk)->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_parms[UB_TCPRCVBUF].held += chargesize; + if (ub->ub_parms[UB_TCPRCVBUF].held > + ub->ub_parms[UB_TCPRCVBUF].barrier && + strict != UB_FORCE) + goto excess; + ub_adjust_maxheld(ub, UB_TCPRCVBUF); + spin_unlock_irqrestore(&ub->ub_lock, flags); + +out: + if (retval == 0) { + charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF, + chargesize); + ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); + } + return retval; + +excess: + ub->ub_rmem_pressure = UB_RMEM_SHRINK; + if (strict == UB_HARD) + retval = -ENOMEM; + if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit) + retval = -ENOMEM; + /* + * We try to leave numsock*maxadvmss as a reserve for sockets not + * queueing any data yet (if the difference between the barrier and the + * limit is enough for this reserve). 
+ */ + if (ub->ub_parms[UB_TCPRCVBUF].held + + ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss + > ub->ub_parms[UB_TCPRCVBUF].limit && + atomic_read(&sk->sk_rmem_alloc)) + retval = -ENOMEM; + if (retval) { + ub->ub_parms[UB_TCPRCVBUF].held -= chargesize; + ub->ub_parms[UB_TCPRCVBUF].failcnt++; + } + ub_adjust_maxheld(ub, UB_TCPRCVBUF); + spin_unlock_irqrestore(&ub->ub_lock, flags); + goto out; +} +EXPORT_SYMBOL(ub_sock_tcp_chargerecv); + +static void ub_tcprcvbuf_uncharge(struct sk_buff *skb) +{ + unsigned long flags; + unsigned long held, bar; + int prev_pres; + struct user_beancounter *ub; + + ub = top_beancounter(skb_bc(skb)->ub); + if (ub_barrier_farsz(ub, UB_TCPRCVBUF)) { + sock_bc(skb->sk)->forw_space += skb_bc(skb)->charged; + ub_skb_set_uncharge(skb); + return; + } + + spin_lock_irqsave(&ub->ub_lock, flags); + if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) { + printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n", + skb_bc(skb)->charged, + ub, ub->ub_parms[UB_TCPRCVBUF].held); + /* ass-saving bung */ + skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held; + } + ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged; + held = ub->ub_parms[UB_TCPRCVBUF].held; + bar = ub->ub_parms[UB_TCPRCVBUF].barrier; + prev_pres = ub->ub_rmem_pressure; + if (held <= bar - (bar >> 2)) + ub->ub_rmem_pressure = UB_RMEM_EXPAND; + else if (held <= bar) + ub->ub_rmem_pressure = UB_RMEM_KEEP; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF, + skb_bc(skb)->charged); + ub_skb_set_uncharge(skb); +} + + +/* + * UB_OTHERSOCKBUF and UB_TCPSNDBUF + */ + +static void ub_socksndbuf_uncharge(struct sk_buff *skb) +{ + unsigned long flags; + struct user_beancounter *ub, *cub; + unsigned long chargesize; + + cub = skb_bc(skb)->ub; + ub = top_beancounter(cub); + chargesize = skb_bc(skb)->charged; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_OTHERSOCKBUF, chargesize); + 
if (skb->sk != NULL && sock_has_ubc(skb->sk)) + ub_sock_wcharge_dec(skb->sk, chargesize); + ub_sock_snd_wakeup(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, chargesize); + ub_skb_set_uncharge(skb); +} + +/* expected to be called under socket lock */ +static void ub_tcpsndbuf_uncharge(struct sk_buff *skb) +{ + /* + * ub_sock_ret_wreserv call is abused here, we just want to uncharge + * skb size. However, to reduce duplication of the code doing + * ub_hfbarrier_hit check, ub_wcharged reduction, and wakeup we call + * a function that already does all of this. 2006/04/27 SAW + */ + ub_sock_ret_wreserv(skb->sk, UB_TCPSNDBUF, skb_bc(skb)->charged, + sock_bc(skb->sk)->poll_reserv); + ub_skb_set_uncharge(skb); +} + +void ub_skb_uncharge(struct sk_buff *skb) +{ + switch (skb_bc(skb)->resource) { + case UB_TCPSNDBUF: + ub_tcpsndbuf_uncharge(skb); + break; + case UB_TCPRCVBUF: + ub_tcprcvbuf_uncharge(skb); + break; + case UB_DGRAMRCVBUF: + ub_sockrcvbuf_uncharge(skb); + break; + case UB_OTHERSOCKBUF: + ub_socksndbuf_uncharge(skb); + break; + } +} + +EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */ + +/* + * Other sock reserve managment + */ + +int ub_sock_getwres_other(struct sock *sk, unsigned long size) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + unsigned long added_reserv; + int err; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + /* + * Nothing except beancounter lock protects skbc->poll_reserv. + * So, take the lock and do the job. + * Dances with added_reserv repeat ub_sock_make_wreserv. 
+ */ + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + added_reserv = -skbc->poll_reserv; + err = ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, size); + added_reserv += skbc->poll_reserv; + if (!err) + skbc->poll_reserv -= size; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (added_reserv) + charge_beancounter_notop(skbc->ub, UB_OTHERSOCKBUF, added_reserv); + + return err; +} +EXPORT_SYMBOL(ub_sock_getwres_other); + +void ub_sock_retwres_other(struct sock *sk, + unsigned long size, unsigned long ressize) +{ + if (unlikely(!sock_has_ubc(sk))) + return; + + ub_sock_do_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize); +} + +/* + * TCP send buffers accouting. Paged part + */ + +int ub_sock_tcp_chargepage(struct sock *sk) +{ + struct sock_beancounter *skbc; + unsigned long extra; + int err; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + skbc = sock_bc(sk); + ub_sock_make_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE); + if (likely(skbc->poll_reserv >= PAGE_SIZE)) { + skbc->poll_reserv -= PAGE_SIZE; + return 0; + } + + /* + * Ok, full page is not available. + * However, this function must succeed if poll previously indicated + * that write is possible. We better make a forced charge here + * than reserve a whole page in poll. + */ + err = ub_sock_make_wreserv(sk, UB_TCPSNDBUF, SOCK_MIN_UBCSPACE); + if (unlikely(err < 0)) + goto out; + if (skbc->poll_reserv < PAGE_SIZE) { + extra = PAGE_SIZE - skbc->poll_reserv; + err = charge_beancounter(skbc->ub, UB_TCPSNDBUF, extra, + UB_FORCE); + if (err < 0) + goto out; + skbc->poll_reserv += extra; + } + skbc->poll_reserv -= PAGE_SIZE; + return 0; + +out: + return err; +} + +void ub_sock_tcp_detachpage(struct sock *sk) +{ + struct sk_buff *skb; + + if (unlikely(!sock_has_ubc(sk))) + return; + + /* The page is just detached from socket. 
The last skb in queue + with paged part holds referrence to it */ + skb = skb_peek_tail(&sk->sk_write_queue); + if (skb == NULL) { + /* If the queue is empty - all data is sent and page is about + to be freed */ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE, + sock_bc(sk)->poll_reserv); + } else { + /* Last skb is a good aproximation for a last skb with + paged part */ + skb_bc(skb)->charged += PAGE_SIZE; + } +} + +/* + * TCPSNDBUF charge functions below are called in the following cases: + * - sending of SYN, SYN-ACK, FIN, the latter charge is forced by + * some technical reasons in TCP code; + * - fragmentation of TCP packets. + * These functions are allowed but not required to use poll_reserv. + * Originally, these functions didn't do that, since it didn't make + * any sense. Now, since poll_reserv now has a function of general reserve, + * they use it. + */ +int ub_sock_tcp_chargesend(struct sock *sk, struct sk_buff *skb, + enum ub_severity strict) +{ + int ret; + unsigned long chargesize; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + skbc = sock_bc(sk); + chargesize = skb_charge_fullsize(skb); + if (likely(skbc->poll_reserv >= chargesize)) { + skbc->poll_reserv -= chargesize; + __ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + /* XXX hack, see ub_skb_set_charge */ + skb->sk = sk; + return 0; + } + + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ret = __charge_beancounter_locked(ub, UB_TCPSNDBUF, + chargesize, strict); + /* + * Note: this check is not equivalent of the corresponding check + * in makewreserv. It's similar in spirit, but an equivalent check + * would be too long and complicated here. 
+ */ + if (!ret && ub_barrier_hit(ub, UB_TCPSNDBUF)) + skbc->ub_wcharged += chargesize; + spin_unlock_irqrestore(&ub->ub_lock, flags); + if (likely(!ret)) { + charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, chargesize); + ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + } + return ret; +} +EXPORT_SYMBOL(ub_sock_tcp_chargesend); + +/* + * Initialization + */ + +int __init skbc_cache_init(void) +{ + return 0; +} diff -urNp linux-2.6.32.48/kernel/bc/oom_kill.c linux-2.6.32.48-openvz/kernel/bc/oom_kill.c --- linux-2.6.32.48/kernel/bc/oom_kill.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/oom_kill.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define UB_OOM_TIMEOUT (5 * HZ) + +int oom_generation; +int oom_kill_counter; +static DEFINE_SPINLOCK(oom_lock); +static DECLARE_WAIT_QUEUE_HEAD(oom_wq); + +static inline int ub_oom_completed(struct task_struct *tsk) +{ + if (test_tsk_thread_flag(tsk, TIF_MEMDIE)) + /* we were oom killed - just die */ + return 1; + if (tsk->task_bc.oom_generation != oom_generation) + /* some task was succesfully killed */ + return 1; + return 0; +} + +static void ub_clear_oom(void) +{ + struct user_beancounter *ub; + + rcu_read_lock(); + for_each_beancounter(ub) + ub->ub_oom_noproc = 0; + rcu_read_unlock(); +} + +int ub_oom_lock(void) +{ + int timeout; + DEFINE_WAIT(oom_w); + struct task_struct *tsk; + + tsk = current; + + spin_lock(&oom_lock); + if (!oom_kill_counter) + goto out_do_oom; + + timeout = UB_OOM_TIMEOUT; + while (1) { + if (ub_oom_completed(tsk)) { + spin_unlock(&oom_lock); + return -EINVAL; + } + + if (timeout == 0) + break; + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&oom_wq, &oom_w); + spin_unlock(&oom_lock); + + timeout = schedule_timeout(timeout); + + spin_lock(&oom_lock); + remove_wait_queue(&oom_wq, &oom_w); + } + +out_do_oom: + ub_clear_oom(); + return 0; +} + +static 
inline long ub_current_overdraft(struct user_beancounter *ub) +{ + return ub->ub_parms[UB_OOMGUARPAGES].held + + ((ub->ub_parms[UB_KMEMSIZE].held + + ub->ub_parms[UB_TCPSNDBUF].held + + ub->ub_parms[UB_TCPRCVBUF].held + + ub->ub_parms[UB_OTHERSOCKBUF].held + + ub->ub_parms[UB_DGRAMRCVBUF].held) + >> PAGE_SHIFT) - ub->ub_parms[UB_OOMGUARPAGES].barrier; +} + +int ub_oom_task_skip(struct user_beancounter *ub, struct task_struct *tsk) +{ + struct user_beancounter *mm_ub; + + if (ub == NULL) + return 0; + + task_lock(tsk); + if (tsk->mm == NULL) + mm_ub = NULL; + else + mm_ub = tsk->mm->mm_ub; + + while (mm_ub != NULL && mm_ub != ub) + mm_ub = mm_ub->parent; + task_unlock(tsk); + + return mm_ub != ub; +} + +struct user_beancounter *ub_oom_select_worst(void) +{ + struct user_beancounter *ub, *walkp; + long ub_maxover; + + ub_maxover = 0; + ub = NULL; + + rcu_read_lock(); + for_each_beancounter (walkp) { + long ub_overdraft; + + if (walkp->parent != NULL) + continue; + if (walkp->ub_oom_noproc) + continue; + + ub_overdraft = ub_current_overdraft(walkp); + if (ub_overdraft > ub_maxover && get_beancounter_rcu(walkp)) { + put_beancounter(ub); + ub = walkp; + ub_maxover = ub_overdraft; + } + } + + if (ub) + ub->ub_oom_noproc = 1; + rcu_read_unlock(); + + return ub; +} + +void ub_oom_mm_killed(struct user_beancounter *ub) +{ + static struct ub_rate_info ri = { 5, 60*HZ }; + + /* increment is serialized with oom_lock */ + ub->ub_parms[UB_OOMGUARPAGES].failcnt++; + + if (ub_ratelimit(&ri)) + show_mem(); +} + +void ub_oom_unlock(void) +{ + spin_unlock(&oom_lock); +} + +void ub_oom_task_dead(struct task_struct *tsk) +{ + spin_lock(&oom_lock); + oom_kill_counter = 0; + oom_generation++; + + printk("OOM killed process %s (pid=%d, ve=%d) exited, " + "free=%lu gen=%d.\n", + tsk->comm, tsk->pid, VEID(tsk->ve_task_info.owner_env), + nr_free_pages(), oom_generation); + /* if there is time to sleep in ub_oom_lock -> sleep will continue */ + wake_up_all(&oom_wq); + spin_unlock(&oom_lock); 
+} + +void ub_out_of_memory(struct user_beancounter *scope) +{ + struct user_beancounter *ub; + struct task_struct *p; + + spin_lock(&oom_lock); + ub_clear_oom(); + ub = get_beancounter(scope); + + read_lock(&tasklist_lock); +retry: + p = select_bad_process(ub, NULL); + if (p == NULL || PTR_ERR(p) == -1UL) + goto unlock; + + if (oom_kill_process(p, (gfp_t)-1, -1, NULL, "UB Out of memory")) + goto retry; + + put_beancounter(ub); + +unlock: + read_unlock(&tasklist_lock); + spin_unlock(&oom_lock); +} +EXPORT_SYMBOL(ub_out_of_memory); diff -urNp linux-2.6.32.48/kernel/bc/proc.c linux-2.6.32.48-openvz/kernel/bc/proc.c --- linux-2.6.32.48/kernel/bc/proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/proc.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,703 @@ +/* + * kernel/bc/proc.c + * + * Copyright (C) 2006 OpenVZ. SWsoft Inc. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Generic output formats */ +#if BITS_PER_LONG == 32 +const char *bc_proc_lu_fmt = "\t%-20s %10lu\n"; +const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; +const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; +const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n"; +#else +const char *bc_proc_lu_fmt = "\t%-20s %21lu\n"; +const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; +const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; +const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n"; +#endif + +#if BITS_PER_LONG == 32 +static const char *head_fmt = "%10s %-12s %10s %10s %10s %10s %10s\n"; +static const char *res_fmt = "%10s %-12s %10lu %10lu %10lu %10lu %10lu\n"; +#else +static const char *head_fmt = "%10s %-12s %20s %20s %20s %20s %20s\n"; +static const char *res_fmt = "%10s %-12s %20lu %20lu %20lu %20lu %20lu\n"; +#endif + +static void ub_show_res(struct seq_file *f, struct user_beancounter *ub, + int r, int show_uid) +{ + int len; + char ub_uid[64]; + + if (show_uid && r == 0) { + len = print_ub_uid(ub, ub_uid, 
sizeof(ub_uid) - 2); + ub_uid[len] = ':'; + ub_uid[len + 1] = '\0'; + } else + strcpy(ub_uid, ""); + + seq_printf(f, res_fmt, ub_uid, ub_rnames[r], + ub->ub_parms[r].held, + ub->ub_parms[r].maxheld, + ub->ub_parms[r].barrier, + ub->ub_parms[r].limit, + ub->ub_parms[r].failcnt); +} + +static void __show_resources(struct seq_file *f, struct user_beancounter *ub, + int show_uid) +{ + int i; + + for (i = 0; i < UB_RESOURCES_COMPAT; i++) + if (strcmp(ub_rnames[i], "dummy") != 0) + ub_show_res(f, ub, i, show_uid); + + for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++) + ub_show_res(f, ub, i, show_uid); +} + +static int bc_resources_show(struct seq_file *f, void *v) +{ + __show_resources(f, seq_beancounter(f), 0); + return 0; +} + +static struct bc_proc_entry bc_resources_entry = { + .name = "resources", + .u.show = bc_resources_show, +}; + +#ifdef CONFIG_UBC_DEBUG +static int bc_debug_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + char buf[64]; + + ub = seq_beancounter(f); + print_ub_uid(ub, buf, sizeof(buf)); + seq_printf(f, "uid: %s\n", buf); + seq_printf(f, "ref: %d\n", atomic_read(&ub->ub_refcount)); + + seq_printf(f, "bc: %p\n", ub); + seq_printf(f, "par: %p\n", ub->parent); + seq_printf(f, "priv: %p\n", ub->private_data); + return 0; +} + +static struct bc_proc_entry bc_debug_entry = { + .name = "debug", + .u.show = bc_debug_show, +}; +#endif + +static int ub_show(struct seq_file *f, void *v) +{ + int i; + + for (i = 0; i < UB_RESOURCES_COMPAT; i++) + ub_show_res(f, (struct user_beancounter *)v, i, 1); + return 0; +} + +static int res_show(struct seq_file *f, void *v) +{ + __show_resources(f, (struct user_beancounter *)v, 1); + return 0; +} + +static int ub_accessible(struct user_beancounter *exec, + struct user_beancounter *target) +{ + struct user_beancounter *p, *q; + + p = top_beancounter(exec); + q = top_beancounter(target); + + return (p == get_ub0() || p == q); +} + +static void ub_show_header(struct seq_file *f) +{ + seq_printf(f, 
"Version: 2.5\n"); + seq_printf(f, head_fmt, "uid", "resource", + "held", "maxheld", "barrier", "limit", "failcnt"); +} + +static void *ub_start(struct seq_file *f, loff_t *ppos) +{ + struct user_beancounter *ub; + struct user_beancounter *exec_ub; + unsigned long pos; + + pos = *ppos; + if (pos == 0) + ub_show_header(f); + + exec_ub = get_exec_ub(); + + rcu_read_lock(); + for_each_beancounter(ub) { + if (ub->parent != NULL) + continue; + if (!ub_accessible(exec_ub, ub)) + continue; + if (pos-- == 0) + return ub; + } + return NULL; +} + +static void *ub_next(struct seq_file *f, void *v, loff_t *ppos) +{ + struct user_beancounter *ub; + struct list_head *entry; + struct user_beancounter *exec_ub; + + exec_ub = get_exec_ub(); + ub = (struct user_beancounter *)v; + + entry = &ub->ub_list; + + list_for_each_continue_rcu(entry, &ub_list_head) { + ub = list_entry(entry, struct user_beancounter, ub_list); + if (ub->parent != NULL) + continue; + if (!ub_accessible(exec_ub, ub)) + continue; + + (*ppos)++; + return ub; + } + return NULL; +} + +static void ub_stop(struct seq_file *f, void *v) +{ + rcu_read_unlock(); +} + +static struct seq_operations ub_seq_ops = { + .start = ub_start, + .next = ub_next, + .stop = ub_stop, + .show = ub_show, +}; + +static int ub_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &ub_seq_ops); +} + +static struct file_operations ub_file_operations = { + .open = ub_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct seq_operations res_seq_ops = { + .start = ub_start, + .next = ub_next, + .stop = ub_stop, + .show = res_show, +}; + +static int res_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &res_seq_ops); +} + +static struct file_operations resources_operations = { + .open = 
res_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct bc_proc_entry bc_all_resources_entry = { + .name = "resources", + .u.fops = &resources_operations, +}; + +/* + * Generic showing stuff + */ + +static int cookies, num_entries; +static struct bc_proc_entry *bc_entries __read_mostly; +static struct bc_proc_entry *bc_root_entries __read_mostly; +static DEFINE_SPINLOCK(bc_entries_lock); +static struct proc_dir_entry *bc_proc_root; + +void bc_register_proc_entry(struct bc_proc_entry *e) +{ + spin_lock(&bc_entries_lock); + e->cookie = ++cookies; + e->next = bc_entries; + bc_entries = e; + num_entries++; + spin_unlock(&bc_entries_lock); +} + +EXPORT_SYMBOL(bc_register_proc_entry); + +void bc_register_proc_root_entry(struct bc_proc_entry *e) +{ + spin_lock(&bc_entries_lock); + e->cookie = ++cookies; + e->next = bc_root_entries; + bc_root_entries = e; + bc_proc_root->nlink++; + spin_unlock(&bc_entries_lock); +} + +EXPORT_SYMBOL(bc_register_proc_root_entry); + +/* + * small helpers + */ + +static inline unsigned long bc_make_ino(struct user_beancounter *ub) +{ + unsigned long ret; + + ret = 0xbc000000; + if (ub->parent) + ret |= ((ub->parent->ub_uid + 1) << 4); + ret |= (ub->ub_uid + 1); + return ret; +} + +static inline unsigned long bc_make_file_ino(struct bc_proc_entry *de) +{ + return 0xbe000000 + de->cookie; +} + +static int bc_d_delete(struct dentry *d) +{ + return 1; +} + +static void bc_d_release(struct dentry *d) +{ + put_beancounter((struct user_beancounter *)d->d_fsdata); +} + +static struct inode_operations bc_entry_iops; +static struct file_operations bc_entry_fops; +static struct dentry_operations bc_dentry_ops = { + .d_delete = bc_d_delete, + .d_release = bc_d_release, +}; + +/* + * common directory operations' helpers + */ + +static int bc_readdir(struct file *file, filldir_t filler, void *data, + struct user_beancounter *parent) +{ + int err = 0; + loff_t pos, filled; + struct user_beancounter *ub, *prev; + 
struct bc_proc_entry *pde; + + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EPERM; + + pos = file->f_pos; + if (pos == 0) { + err = (*filler)(data, ".", 1, pos, + file->f_dentry->d_inode->i_ino, DT_DIR); + if (err < 0) { + err = 0; + goto out; + } + pos++; + } + + if (pos == 1) { + err = (*filler)(data, "..", 2, pos, + parent_ino(file->f_dentry), DT_DIR); + if (err < 0) { + err = 0; + goto out; + } + pos++; + } + + filled = 2; + for (pde = (parent == NULL ? bc_root_entries : bc_entries); + pde != NULL; pde = pde->next) { + if (filled++ < pos) + continue; + + err = (*filler)(data, pde->name, strlen(pde->name), pos, + bc_make_file_ino(pde), DT_REG); + if (err < 0) { + err = 0; + goto out; + } + pos++; + } + + rcu_read_lock(); + prev = NULL; + ub = list_entry(&ub_list_head, struct user_beancounter, ub_list); + while (1) { + int len; + unsigned long ino; + char buf[64]; + + ub = list_entry(rcu_dereference(ub->ub_list.next), + struct user_beancounter, ub_list); + if (&ub->ub_list == &ub_list_head) + break; + + if (ub->parent != parent) + continue; + + if (filled++ < pos) + continue; + + if (!get_beancounter_rcu(ub)) + continue; + + rcu_read_unlock(); + put_beancounter(prev); + + len = print_ub_uid(ub, buf, sizeof(buf)); + ino = bc_make_ino(ub); + + err = (*filler)(data, buf, len, pos, ino, DT_DIR); + if (err < 0) { + err = 0; + put_beancounter(ub); + goto out; + } + + rcu_read_lock(); + prev = ub; + pos++; + } + rcu_read_unlock(); + put_beancounter(prev); +out: + file->f_pos = pos; + return err; +} + +static int bc_looktest(struct inode *ino, void *data) +{ + return ino->i_op == &bc_entry_iops && ino->i_private == data; +} + +static int bc_lookset(struct inode *ino, void *data) +{ + struct user_beancounter *ub; + + ub = (struct user_beancounter *)data; + ino->i_private = data; + ino->i_ino = bc_make_ino(ub); + ino->i_fop = &bc_entry_fops; + ino->i_op = &bc_entry_iops; + ino->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; + /* subbeancounters are 
not included, but who cares? */ + ino->i_nlink = num_entries + 2; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +/* + * Bind @dentry to the directory inode of beancounter @ub. + * On success @ub is stored in dentry->d_fsdata and NULL is returned; + * on failure the reference on @ub is dropped here and ERR_PTR(-ENOENT) + * is returned. + */ +static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir, + struct dentry *dentry) +{ + struct inode *ino; + + ino = iget5_locked(dir->i_sb, ub->ub_uid, bc_looktest, bc_lookset, ub); + if (ino == NULL) + goto out_put; + + /* iget5_locked() may return a cached inode that is not locked/new */ + if (ino->i_state & I_NEW) + unlock_new_inode(ino); + dentry->d_op = &bc_dentry_ops; + dentry->d_fsdata = ub; + d_add(dentry, ino); + return NULL; + +out_put: + put_beancounter(ub); + return ERR_PTR(-ENOENT); +} + +/* + * files (bc_proc_entry) manipulations + */ + +/* + * Look @dentry's name up in the entry list starting at @root and bind the + * dentry to the matching inode. Returns ERR_PTR(-ESRCH) when no entry has + * that name, ERR_PTR(-ENOENT) when no inode could be obtained, else NULL. + */ +static struct dentry *bc_lookup_file(struct inode *dir, + struct dentry *dentry, struct bc_proc_entry *root, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *)) +{ + struct bc_proc_entry *pde; + struct inode *ino; + + for (pde = root; pde != NULL; pde = pde->next) + if (strcmp(pde->name, dentry->d_name.name) == 0) + break; + + if (pde == NULL) + return ERR_PTR(-ESRCH); + + ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde); + if (ino == NULL) + return ERR_PTR(-ENOENT); + + /* only a freshly created inode carries I_NEW and needs unlocking */ + if (ino->i_state & I_NEW) + unlock_new_inode(ino); + dentry->d_op = &bc_dentry_ops; + d_add(dentry, ino); + return NULL; +} + +static int bc_file_open(struct inode *ino, struct file *filp) +{ + struct bc_proc_entry *de; + struct user_beancounter *ub; + + de = (struct bc_proc_entry *)ino->i_private; + ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata; + BUG_ON(ub->ub_magic != UB_MAGIC); + + /* + * ub can't disappear: we hold d_parent, he holds the beancounter + */ + return single_open(filp, de->u.show, ub); +} + +static struct file_operations bc_file_ops = { + .open = bc_file_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int bc_looktest_entry(struct inode *ino, void *data) +{ + return ino->i_fop == &bc_file_ops && ino->i_private == data; +} + +static int bc_lookset_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = 
(struct bc_proc_entry *)data; + ino->i_private = data; + ino->i_ino = bc_make_file_ino(de); + ino->i_fop = &bc_file_ops, + ino->i_mode = S_IFREG | S_IRUSR; + ino->i_nlink = 1; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static inline struct dentry *bc_lookup_files(struct inode *dir, + struct dentry *de) +{ + return bc_lookup_file(dir, de, bc_entries, + bc_looktest_entry, bc_lookset_entry); +} + +static int bc_looktest_root_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + return ino->i_fop == de->u.fops && ino->i_private == data; +} + +static int bc_lookset_root_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + ino->i_private = data; + ino->i_ino = bc_make_file_ino(de); + ino->i_fop = de->u.fops; + ino->i_mode = S_IFREG | S_IRUSR; + ino->i_nlink = 1; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static inline struct dentry *bc_lookup_root_files(struct inode *dir, + struct dentry *de) +{ + return bc_lookup_file(dir, de, bc_root_entries, + bc_looktest_root_entry, bc_lookset_root_entry); +} + +/* + * /proc/bc/.../ directory operations + */ + +static int bc_entry_readdir(struct file *file, void *data, filldir_t filler) +{ + return bc_readdir(file, filler, data, + (struct user_beancounter *)file->f_dentry->d_fsdata); +} + +static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int id; + char *end; + struct user_beancounter *par, *ub; + struct dentry *de; + + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return ERR_PTR(-EPERM); + + de = bc_lookup_files(dir, dentry); + if (de != ERR_PTR(-ESRCH)) + return de; + + id = simple_strtol(dentry->d_name.name, &end, 10); + if (*end != '.') + return ERR_PTR(-ENOENT); + + par = (struct user_beancounter *)dir->i_private; + if (par->ub_uid != id) + return ERR_PTR(-ENOENT); + + id = simple_strtol(end + 1, &end, 10); + if (*end 
!= '\0') + return ERR_PTR(-ENOENT); + + ub = get_subbeancounter_byid(par, id, 0); + if (ub == NULL) + return ERR_PTR(-ENOENT); + + return bc_lookup(ub, dir, dentry); +} + +static int bc_entry_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct user_beancounter *ub; + + generic_fillattr(dentry->d_inode, stat); + ub = (struct user_beancounter *)dentry->d_fsdata; + stat->nlink = ub->ub_childs + 2; + return 0; +} + +static struct file_operations bc_entry_fops = { + .read = generic_read_dir, + .readdir = bc_entry_readdir, +}; + +static struct inode_operations bc_entry_iops = { + .lookup = bc_entry_lookup, + .getattr = bc_entry_getattr, +}; + +/* + * /proc/bc directory operations + */ + +static int bc_root_readdir(struct file *file, void *data, filldir_t filler) +{ + return bc_readdir(file, filler, data, NULL); +} + +static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int id; + char *end; + struct user_beancounter *ub; + struct dentry *de; + + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return ERR_PTR(-EPERM); + + de = bc_lookup_root_files(dir, dentry); + if (de != ERR_PTR(-ESRCH)) + return de; + + id = simple_strtol(dentry->d_name.name, &end, 10); + if (*end != '\0') + return ERR_PTR(-ENOENT); + + ub = get_beancounter_byuid(id, 0); + if (ub == NULL) + return ERR_PTR(-ENOENT); + + return bc_lookup(ub, dir, dentry); +} + +static int bc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + generic_fillattr(dentry->d_inode, stat); + stat->nlink = ub_count + 2; + return 0; +} + +static struct file_operations bc_root_fops = { + .read = generic_read_dir, + .readdir = bc_root_readdir, +}; + +static struct inode_operations bc_root_iops = { + .lookup = bc_root_lookup, + .getattr = bc_root_getattr, +}; + +static int __init ub_init_proc(void) +{ + struct proc_dir_entry *entry; + + bc_proc_root = create_proc_entry("bc", + S_IFDIR | S_IRUSR 
| S_IXUSR, NULL); + if (bc_proc_root == NULL) + panic("Can't create /proc/bc entry"); + + bc_proc_root->proc_fops = &bc_root_fops; + bc_proc_root->proc_iops = &bc_root_iops; + + bc_register_proc_entry(&bc_resources_entry); +#ifdef CONFIG_UBC_DEBUG + bc_register_proc_entry(&bc_debug_entry); +#endif + bc_register_proc_root_entry(&bc_all_resources_entry); + + entry = proc_create("user_beancounters", + S_IRUSR, &glob_proc_root, &ub_file_operations); + return 0; +} + +core_initcall(ub_init_proc); diff -urNp linux-2.6.32.48/kernel/bc/rss_pages.c linux-2.6.32.48-openvz/kernel/bc/rss_pages.c --- linux-2.6.32.48/kernel/bc/rss_pages.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/rss_pages.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,454 @@ +/* + * kernel/bc/rss_pages.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static struct kmem_cache *pb_cachep; +spinlock_t pb_lock = SPIN_LOCK_UNLOCKED; +static struct page_beancounter **pb_hash_table; +static unsigned int pb_hash_mask; + +/* + * Auxiliary staff + */ + +static inline struct page_beancounter *next_page_pb(struct page_beancounter *p) +{ + return list_entry(p->page_list.next, struct page_beancounter, + page_list); +} + +static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p) +{ + return list_entry(p->page_list.prev, struct page_beancounter, + page_list); +} + +/* + * Held pages manipulation + */ +static inline void set_held_pages(struct user_beancounter *bc) +{ + /* all three depend on ub_held_pages */ + __ub_update_physpages(bc); + __ub_update_oomguarpages(bc); + __ub_update_privvm(bc); +} + +static inline void do_dec_held_pages(struct user_beancounter *ub, int value) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_held_pages -= value; + 
set_held_pages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static void dec_held_pages(struct user_beancounter *ub, int value) +{ + for (; ub != NULL; ub = ub->parent) + do_dec_held_pages(ub, value); +} + +static inline void do_inc_held_pages(struct user_beancounter *ub, int value) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_held_pages += value; + set_held_pages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static void inc_held_pages(struct user_beancounter *ub, int value) +{ + for (; ub != NULL; ub = ub->parent) + do_inc_held_pages(ub, value); +} + +/* + * ++ and -- beyond are protected with pb_lock + */ + +static inline void inc_pbc_count(struct user_beancounter *ub) +{ + for (; ub != NULL; ub = ub->parent) + ub->ub_pbcs++; +} + +static inline void dec_pbc_count(struct user_beancounter *ub) +{ + for (; ub != NULL; ub = ub->parent) + ub->ub_pbcs--; +} + +/* + * Alloc - free + */ + +inline int pb_alloc(struct page_beancounter **pbc) +{ + *pbc = kmem_cache_alloc(pb_cachep, GFP_KERNEL); + if (*pbc != NULL) { + (*pbc)->next_hash = NULL; + (*pbc)->pb_magic = PB_MAGIC; + } + return (*pbc == NULL); +} + +inline void pb_free(struct page_beancounter **pb) +{ + if (*pb != NULL) { + kmem_cache_free(pb_cachep, *pb); + *pb = NULL; + } +} + +void pb_free_list(struct page_beancounter **p_pb) +{ + struct page_beancounter *list, *pb; + + list = *p_pb; + if (list == PBC_COPY_SAME) + return; + + while (list) { + pb = list; + list = list->next_hash; + pb_free(&pb); + } + *p_pb = NULL; +} + +/* + * head -> -> -> ... + */ +static int __alloc_list(struct page_beancounter **head, int num) +{ + struct page_beancounter *pb; + + while (num > 0) { + if (pb_alloc(&pb)) + return -1; + pb->next_hash = *head; + *head = pb; + num--; + } + + return num; +} + +/* + * Ensure that the list contains at least num elements. + * p_pb points to an initialized list, may be of the zero length. 
+ * + * mm->page_table_lock should be held + */ +int pb_alloc_list(struct page_beancounter **p_pb, int num) +{ + struct page_beancounter *list; + + for (list = *p_pb; list != NULL && num; list = list->next_hash, num--); + if (!num) + return 0; + + /* + * *p_pb(after) *p_pb (before) + * \ \ + * -...-> -> ... + */ + if (__alloc_list(p_pb, num) < 0) + goto nomem; + return 0; + +nomem: + pb_free_list(p_pb); + return -ENOMEM; +} + +/* + * Allocates a page_beancounter for each + * user_beancounter in a hash + */ +int pb_alloc_all(struct page_beancounter **pbs) +{ + int need_alloc; + struct user_beancounter *ub; + + need_alloc = 0; + rcu_read_lock(); + for_each_beancounter(ub) + need_alloc++; + rcu_read_unlock(); + + if (!__alloc_list(pbs, need_alloc)) + return 0; + + pb_free_list(pbs); + return -ENOMEM; +} + +/* + * Hash routines + */ + +static inline int pb_hash(struct user_beancounter *ub, struct page *page) +{ + return (page_to_pfn(page) ^ ub->ub_cookie) & pb_hash_mask; +} + +/* pb_lock should be held */ +static inline void insert_pb(struct page_beancounter *p, struct page *page, + struct user_beancounter *ub, int hash) +{ + p->page = page; + p->ub = get_beancounter(ub); + p->next_hash = pb_hash_table[hash]; + pb_hash_table[hash] = p; + inc_pbc_count(ub); +} + +/* + * Heart + */ + +static int __pb_dup_ref(struct page *page, struct user_beancounter *bc, + int hash) +{ + struct page_beancounter *p; + + for (p = pb_hash_table[hash]; + p != NULL && (p->page != page || p->ub != bc); + p = p->next_hash); + if (p == NULL) + return -1; + + PB_COUNT_INC(p->refcount); + return 0; +} + +static void __pb_add_ref(struct page *page, struct user_beancounter *bc, + struct page_beancounter **ppb, int hash) +{ + struct page_beancounter *head, *p, **hp; + int shift; + + p = *ppb; + *ppb = p->next_hash; + + insert_pb(p, page, bc, hash); + hp = page_pblist(page); + head = *hp; + + if (head != NULL) { + /* + * Move the first element to the end of the list. 
+ * List head (pb_head) is set to the next entry. + * Note that this code works even if head is the only element + * on the list (because it's cyclic). + */ + BUG_ON(head->pb_magic != PB_MAGIC); + *hp = next_page_pb(head); + PB_SHIFT_INC(head->refcount); + shift = PB_SHIFT_GET(head->refcount); + /* + * Update user beancounter, the share of head has been changed. + * Note that the shift counter is taken after increment. + */ + dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift); + /* add the new page beancounter to the end of the list */ + head = *hp; + list_add_tail(&p->page_list, &head->page_list); + } else { + *hp = p; + shift = 0; + INIT_LIST_HEAD(&p->page_list); + } + + p->refcount = PB_REFCOUNT_MAKE(shift, 1); + /* update user beancounter for the new page beancounter */ + inc_held_pages(bc, UB_PAGE_WEIGHT >> shift); +} + +void pb_add_ref(struct page *page, struct mm_struct *mm, + struct page_beancounter **p_pb) +{ + int hash; + struct user_beancounter *bc; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + if (__pb_dup_ref(page, bc, hash)) + __pb_add_ref(page, bc, p_pb, hash); + spin_unlock(&pb_lock); +} + +void pb_dup_ref(struct page *page, struct mm_struct *mm, + struct page_beancounter **p_pb) +{ + int hash; + struct user_beancounter *bc; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + if (*page_pblist(page) == NULL) + /* + * pages like ZERO_PAGE must not be accounted in pbc + * so on fork we just skip them + */ + goto out_unlock; + + if (unlikely(*p_pb != PBC_COPY_SAME)) + __pb_add_ref(page, bc, p_pb, hash); + else if (unlikely(__pb_dup_ref(page, bc, hash))) + WARN_ON(1); +out_unlock: + spin_unlock(&pb_lock); +} + +void pb_remove_ref(struct page *page, struct mm_struct *mm) +{ + int hash; + struct 
user_beancounter *bc; + struct page_beancounter *p, **q, *f; + int shift, shiftt; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + for (q = pb_hash_table + hash, p = *q; + p != NULL && (p->page != page || p->ub != bc); + q = &p->next_hash, p = *q); + if (p == NULL) + goto out_unlock; + + PB_COUNT_DEC(p->refcount); + if (PB_COUNT_GET(p->refcount)) + /* + * More references from the same user beancounter exist. + * Nothing needs to be done. + */ + goto out_unlock; + + /* remove from the hash list */ + f = p; + *q = p->next_hash; + + shift = PB_SHIFT_GET(p->refcount); + + dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift); + + q = page_pblist(page); + if (*q == p) { + if (list_empty(&p->page_list)) { + *q = NULL; + goto out_free; + } + + *q = next_page_pb(p); + } + list_del(&p->page_list); + + /* Now balance the list. Move the tail and adjust its shift counter. */ + p = prev_page_pb(*q); + shiftt = PB_SHIFT_GET(p->refcount); + *q = p; + PB_SHIFT_DEC(p->refcount); + + inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); + + /* + * If the shift counter of the moved beancounter is different from the + * removed one's, repeat the procedure for one more tail beancounter + */ + if (shiftt > shift) { + p = prev_page_pb(*q); + *q = p; + PB_SHIFT_DEC(p->refcount); + inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); + } +out_free: + dec_pbc_count(f->ub); + spin_unlock(&pb_lock); + + put_beancounter(f->ub); + pb_free(&f); + return; + +out_unlock: + spin_unlock(&pb_lock); +} + +struct user_beancounter *pb_grab_page_ub(struct page *page) +{ + struct page_beancounter *pb; + struct user_beancounter *ub; + + spin_lock(&pb_lock); + pb = *page_pblist(page); + ub = (pb == NULL ? 
ERR_PTR(-EINVAL) : + get_beancounter(pb->ub)); + spin_unlock(&pb_lock); + return ub; +} + +/* + * Boot-time setup of page beancounters: create the "page_beancounter" + * slab cache and allocate the zeroed pb hash table. pb_hash_mask becomes + * the smallest 2^k - 1 value covering num_physpages >> 2 and the table + * has pb_hash_mask + 1 buckets. + */ +void __init ub_init_pbc(void) +{ + unsigned long hash_size; + + pb_cachep = kmem_cache_create("page_beancounter", + sizeof(struct page_beancounter), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + hash_size = num_physpages >> 2; + for (pb_hash_mask = 1; + (hash_size & pb_hash_mask) != hash_size; + pb_hash_mask = (pb_hash_mask << 1) + 1); + hash_size = pb_hash_mask + 1; + printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size); + pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *)); + /* fail loudly instead of dereferencing NULL in memset() below */ + if (pb_hash_table == NULL) + panic("Can't allocate page beancounter hash"); + memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *)); + + ub_init_io(pb_cachep); +} diff -urNp linux-2.6.32.48/kernel/bc/statd.c linux-2.6.32.48-openvz/kernel/bc/statd.c --- linux-2.6.32.48/kernel/bc/statd.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/statd.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,453 @@ +/* + * kernel/bc/statd.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(ubs_notify_list); +static long ubs_min_interval; +static ubstattime_t ubs_start_time, ubs_end_time; +static struct timer_list ubs_timer; + +/* + * Copy the uids of all top-level beancounters (parent == NULL) to the user + * buffer @buf, one page-sized batch at a time, writing at most @size bytes. + * Returns the number of bytes copied, -ENOMEM or -EFAULT. + * + * ub_hash_lock is dropped around each copy_to_user(); the beancounter the + * walk stopped at is pinned in @ubp meanwhile and released exactly once. + */ +static int ubstat_get_list(void __user *buf, long size) +{ + int retval; + struct user_beancounter *ub, *ubp; + long *page, *ptr, *end; + int len; + + page = (long *)__get_free_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + retval = 0; + ubp = NULL; + ptr = page; + end = page + PAGE_SIZE / sizeof(*ptr); + + spin_lock_irq(&ub_hash_lock); + for_each_beancounter(ub) { + if (ub->parent != NULL) + continue; + *ptr++ = ub->ub_uid; + if (ptr != end) + continue; + + get_beancounter(ub); + spin_unlock_irq(&ub_hash_lock); + + put_beancounter(ubp); + ubp = ub; + + len = min_t(long, (ptr - page) * sizeof(*ptr), size); + if (copy_to_user(buf, page, len)) { + retval = -EFAULT; + goto out_put; + } + retval += len; + if (len < PAGE_SIZE) + goto out_put; + buf += len; + size -= len; + + ptr = page; + end = page + PAGE_SIZE / sizeof(*ptr); + + spin_lock_irq(&ub_hash_lock); + } + spin_unlock_irq(&ub_hash_lock); + + /* ubp is released once, at out_put; a second put here would underflow */ + size = min_t(long, (ptr - page) * sizeof(*ptr), size); + if (size > 0 && copy_to_user(buf, page, size)) { + retval = -EFAULT; + goto out_put; + } + retval += size; + +out_put: + put_beancounter(ubp); + free_page((unsigned long)page); + return retval; +} + +static int ubstat_gettime(void __user *buf, long size) +{ + ubgettime_t data; + int retval; + + spin_lock(&ubs_notify_lock); + data.start_time = ubs_start_time; + data.end_time = ubs_end_time; + data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ; + spin_unlock(&ubs_notify_lock); + + retval = min_t(long, sizeof(data), size); + if (copy_to_user(buf, &data, retval)) + retval = -EFAULT; + return retval; +} + 
+static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf) +{ + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparm_t param[1]; + } *data; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + + data->param[0].maxheld = ub->ub_store[res].maxheld; + data->param[0].failcnt = ub->ub_store[res].failcnt; + + return sizeof(*data); +} + +static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size) +{ + int wrote; + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparm_t param[UB_RESOURCES]; + } *data; + int resource; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + wrote = sizeof(data->start_time) + sizeof(data->end_time); + + for (resource = 0; resource < UB_RESOURCES; resource++) { + if (size < wrote + sizeof(data->param[resource])) + break; + data->param[resource].maxheld = ub->ub_store[resource].maxheld; + data->param[resource].failcnt = ub->ub_store[resource].failcnt; + wrote += sizeof(data->param[resource]); + } + + return wrote; +} + +static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf, + int size) +{ + int wrote; + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparmf_t param[UB_RESOURCES]; + } *data; + int resource; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + wrote = sizeof(data->start_time) + sizeof(data->end_time); + + for (resource = 0; resource < UB_RESOURCES; resource++) { + if (size < wrote + sizeof(data->param[resource])) + break; + /* The beginning of ubstatparmf_t matches struct ubparm. 
*/ + memcpy(&data->param[resource], &ub->ub_store[resource], + sizeof(ub->ub_store[resource])); + data->param[resource].__unused1 = 0; + data->param[resource].__unused2 = 0; + wrote += sizeof(data->param[resource]); + } + return wrote; +} + +static int ubstat_get_stat(struct user_beancounter *ub, long cmd, + void __user *buf, long size) +{ + void *kbuf; + int retval; + + kbuf = (void *)__get_free_page(GFP_KERNEL); + if (kbuf == NULL) + return -ENOMEM; + + spin_lock(&ubs_notify_lock); + switch (UBSTAT_CMD(cmd)) { + case UBSTAT_READ_ONE: + retval = -EINVAL; + if (UBSTAT_PARMID(cmd) >= UB_RESOURCES) + break; + retval = ubstat_do_read_one(ub, + UBSTAT_PARMID(cmd), kbuf); + break; + case UBSTAT_READ_ALL: + retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE); + break; + case UBSTAT_READ_FULL: + retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE); + break; + default: + retval = -EINVAL; + } + spin_unlock(&ubs_notify_lock); + + if (retval > 0) { + retval = min_t(long, retval, size); + if (copy_to_user(buf, kbuf, retval)) + retval = -EFAULT; + } + + free_page((unsigned long)kbuf); + return retval; +} + +/* + * Register, update or cancel the calling task's statistics notification. + * An entry for current already on ubs_notify_list is reused instead of the + * new allocation. A non-zero req->signum (re)registers current, zero + * removes any existing registration. req->maxinterval below 1 is rejected + * with -EINVAL; values above TIME_MAX_SEC are clamped. + */ +static int ubstat_handle_notifrq(ubnotifrq_t *req) +{ + int retval; + struct ub_stat_notify *new_notify; + struct list_head *entry; + struct task_struct *tsk_to_free; + + /* allocate the structure itself, not just the size of a pointer */ + new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL); + if (new_notify == NULL) + return -ENOMEM; + + tsk_to_free = NULL; + INIT_LIST_HEAD(&new_notify->list); + + spin_lock(&ubs_notify_lock); + list_for_each(entry, &ubs_notify_list) { + struct ub_stat_notify *notify; + + notify = list_entry(entry, struct ub_stat_notify, list); + if (notify->task == current) { + kfree(new_notify); + new_notify = notify; + break; + } + } + + retval = -EINVAL; + if (req->maxinterval < 1) { + /* no existing entry was found: the fresh allocation is unlinked */ + if (entry == &ubs_notify_list) + kfree(new_notify); + goto out_unlock; + } + if (req->maxinterval > TIME_MAX_SEC) + req->maxinterval = TIME_MAX_SEC; + if (req->maxinterval < ubs_min_interval) { + unsigned long dif; + + ubs_min_interval = req->maxinterval; + dif = (ubs_timer.expires - jiffies + HZ - 1) / 
HZ; + if (dif > req->maxinterval) + mod_timer(&ubs_timer, + ubs_timer.expires - + (dif - req->maxinterval) * HZ); + } + + if (entry != &ubs_notify_list) { + list_del(&new_notify->list); + tsk_to_free = new_notify->task; + } + if (req->signum) { + new_notify->task = current; + get_task_struct(new_notify->task); + new_notify->signum = req->signum; + list_add(&new_notify->list, &ubs_notify_list); + } else + kfree(new_notify); + retval = 0; +out_unlock: + spin_unlock(&ubs_notify_lock); + if (tsk_to_free != NULL) + put_task_struct(tsk_to_free); + return retval; +} + +/* + * former sys_ubstat + * + * Dispatch on @func: UBSTAT_UBPARMNUM returns UB_RESOURCES and + * UBSTAT_UBLIST returns the beancounter list, both without a capability + * check. Every other function needs CAP_DAC_OVERRIDE or + * CAP_DAC_READ_SEARCH, reads the time or the statistics of beancounter + * @arg1 into @buf, and then processes the notification request that + * @arg2 points to in user space. + */ +long do_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size) +{ + int retval; + struct user_beancounter *ub; + + if (func == UBSTAT_UBPARMNUM) + return UB_RESOURCES; + if (func == UBSTAT_UBLIST) + return ubstat_get_list(buf, size); + if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))) + return -EPERM; + + if (func == UBSTAT_GETTIME) { + retval = ubstat_gettime(buf, size); + goto notify; + } + + ub = get_exec_ub(); + if (ub != NULL && ub->ub_uid == arg1) + get_beancounter(ub); + else /* FIXME must be if (ve_is_super) */ + ub = get_beancounter_byuid(arg1, 0); + + if (ub == NULL) + return -ESRCH; + + retval = ubstat_get_stat(ub, func, buf, size); + put_beancounter(ub); +notify: + /* Handle request for notification */ + if (retval >= 0) { + ubnotifrq_t notifrq; + int err; + + err = -EFAULT; + if (!copy_from_user(&notifrq, (void __user *)arg2, + sizeof(notifrq))) + err = ubstat_handle_notifrq(&notifrq); + if (err) + retval = err; + } + + return retval; +} + +static void ubstat_save_onestat(struct user_beancounter *ub) +{ + int resource; + + /* called with local irq disabled */ + spin_lock(&ub->ub_lock); + for (resource = 0; resource < UB_RESOURCES; resource++) { + memcpy(&ub->ub_store[resource], &ub->ub_parms[resource], + sizeof(struct ubparm)); + ub->ub_parms[resource].minheld = + ub->ub_parms[resource].maxheld = +
ub->ub_parms[resource].held;
+	}
+	spin_unlock(&ub->ub_lock);
+}
+
+/* Snapshot the counters of every beancounter with local interrupts off. */
+static void ubstat_save_statistics(void)
+{
+	unsigned long flags;
+	struct user_beancounter *ub;
+
+	local_irq_save(flags);
+	for_each_beancounter (ub)
+		ubstat_save_onestat(ub);
+	local_irq_restore(flags);
+}
+
+/* ubs_timer callback: @__data carries the task to wake up. */
+static void ubstatd_timeout(unsigned long __data)
+{
+	struct task_struct *p;
+
+	p = (struct task_struct *) __data;
+	wake_up_process(p);
+}
+
+/*
+ * Safe wrapper for send_sig. It prevents a race with release_task
+ * for sighand.
+ * Should be called under tasklist_lock.
+ */
+static void task_send_sig(struct ub_stat_notify *notify)
+{
+	if (likely(notify->task->sighand != NULL))
+		send_sig(notify->signum, notify->task, 1);
+}
+
+/*
+ * Close the current accounting interval: advance the start/end stamps,
+ * re-arm ubs_timer by ubs_min_interval seconds, reset ubs_min_interval to
+ * TIME_MAX_SEC, snapshot all beancounters and signal every queued
+ * subscriber.  Each request is one-shot: it is unlinked while the locks
+ * are held and released (task reference and memory) after they are
+ * dropped.
+ */
+static inline void do_notifies(void)
+{
+	LIST_HEAD(notif_free_list);
+	struct ub_stat_notify *notify;
+	struct ub_stat_notify *tmp;
+
+	spin_lock(&ubs_notify_lock);
+	ubs_start_time = ubs_end_time;
+	/*
+	 * the expression below relies on time being unsigned long and
+	 * arithmetic promotion rules
+	 */
+	ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ;
+	mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ);
+	ubs_min_interval = TIME_MAX_SEC;
+	/* save statistics accumulated for the interval */
+	ubstat_save_statistics();
+	/* send signals */
+	read_lock(&tasklist_lock);
+	while (!list_empty(&ubs_notify_list)) {
+		notify = list_entry(ubs_notify_list.next,
+				struct ub_stat_notify, list);
+		task_send_sig(notify);
+		/* park the entry on the private list for freeing below */
+		list_move(&notify->list, &notif_free_list);
+	}
+	read_unlock(&tasklist_lock);
+	spin_unlock(&ubs_notify_lock);
+
+	list_for_each_entry_safe(notify, tmp, &notif_free_list, list) {
+		put_task_struct(notify->task);
+		kfree(notify);
+	}
+}
+
+/*
+ * Kernel thread
+ */
+static int ubstatd(void *unused)
+{
+	/* daemonize call will take care of signals */
+	daemonize("ubstatd");
+
+	ubs_timer.data = (unsigned long)current;
+	ubs_timer.function = ubstatd_timeout;
+	add_timer(&ubs_timer);
+
+	while (1) {
+
set_task_state(current, TASK_INTERRUPTIBLE);
+		if (time_after(ubs_timer.expires, jiffies)) {
+			schedule();
+			try_to_freeze();
+			continue;
+		}
+
+		__set_task_state(current, TASK_RUNNING);
+		do_notifies();
+	}
+	return 0;
+}
+
+/*
+ * Put the timer and the interval bookkeeping into their idle state and
+ * start the ubstatd thread, which is what installs the timer callback.
+ * Returns 0, or the negative error from kernel_thread().
+ */
+static int __init ubstatd_init(void)
+{
+	int pid;
+
+	init_timer(&ubs_timer);
+	ubs_timer.expires = TIME_MAX_JIF;
+	ubs_min_interval = TIME_MAX_SEC;
+	ubs_start_time = ubs_end_time = 0;
+
+	/* report a failed thread start instead of silently ignoring it */
+	pid = kernel_thread(ubstatd, NULL, 0);
+	if (pid < 0)
+		return pid;
+	return 0;
+}
+
+module_init(ubstatd_init);
diff -urNp linux-2.6.32.48/kernel/bc/sys.c linux-2.6.32.48-openvz/kernel/bc/sys.c
--- linux-2.6.32.48/kernel/bc/sys.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.32.48-openvz/kernel/bc/sys.c	2011-11-21 17:40:47.000000000 -0500
@@ -0,0 +1,184 @@
+/*
+ *  kernel/bc/sys.c
+ *
+ *  Copyright (C) 2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ *	The (rather boring) getluid syscall
+ */
+SYSCALL_DEFINE0(getluid)
+{
+	struct user_beancounter *ub;
+
+	ub = get_exec_ub();
+	if (ub == NULL)
+		return -EINVAL;
+
+	return ub->ub_uid;
+}
+
+/*
+ *	The setluid syscall
+ */
+SYSCALL_DEFINE1(setluid, uid_t, uid)
+{
+	struct user_beancounter *ub;
+	struct task_beancounter *task_bc;
+	int error;
+
+	task_bc = &current->task_bc;
+
+	/* You may not disown a setluid */
+	error = -EINVAL;
+	if (uid == (uid_t)-1)
+		goto out;
+
+	/* You may only set an ub as root */
+	error = -EPERM;
+	if (!capable(CAP_SETUID))
+		goto out;
+	/*
+	 * The ub once set is irrevocable to all
+	 * unless it's set from ve0.
+ */ + if (!ve_is_super(get_exec_env())) + goto out; + + /* Ok - set up a beancounter entry for this user */ + error = -ENOBUFS; + ub = get_beancounter_byuid(uid, 1); + if (ub == NULL) + goto out; + + ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) " + "for %.20s pid %d\n", + ub, atomic_read(&ub->ub_refcount), + current->comm, current->pid); + /* install bc */ + error = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_NEWUBC, ub); + if (!(error & NOTIFY_FAIL)) { + put_beancounter(task_bc->exec_ub); + task_bc->exec_ub = ub; + if (!(error & NOTIFY_OK)) { + put_beancounter(task_bc->fork_sub); + task_bc->fork_sub = get_beancounter(ub); + } + error = 0; + } else { + put_beancounter(ub); + error = -ENOBUFS; + } +out: + return error; +} + +long do_setublimit(uid_t uid, unsigned long resource, + unsigned long *new_limits) +{ + int error; + unsigned long flags; + struct user_beancounter *ub; + + error = -EPERM; + if(!capable(CAP_SYS_RESOURCE)) + goto out; + + if (!ve_is_super(get_exec_env())) + goto out; + + error = -EINVAL; + if (resource >= UB_RESOURCES) + goto out; + + error = -EINVAL; + if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE) + goto out; + + error = -ENOENT; + ub = get_beancounter_byuid(uid, 0); + if (ub == NULL) { + ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid); + goto out; + } + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_parms[resource].barrier = new_limits[0]; + ub->ub_parms[resource].limit = new_limits[1]; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + put_beancounter(ub); + + error = 0; +out: + return error; +} + +/* + * The setbeanlimit syscall + */ +SYSCALL_DEFINE3(setublimit, uid_t, uid, unsigned long, resource, + unsigned long __user, *limits) +{ + unsigned long new_limits[2]; + + if (copy_from_user(&new_limits, limits, sizeof(new_limits))) + return -EFAULT; + + return do_setublimit(uid, resource, new_limits); +} + +extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, 
long size); + +SYSCALL_DEFINE5(ubstat, int, func, unsigned long, arg1, unsigned long, arg2, + void __user, *buf, long, size) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + return do_ubstat(func, arg1, arg2, buf, size); +} + +#ifdef CONFIG_COMPAT +#define UB_MAXVALUE_COMPAT ((1UL << (sizeof(compat_long_t) * 8 - 1)) - 1) + +asmlinkage long compat_sys_setublimit(uid_t uid, + compat_long_t resource, + compat_long_t __user *limits) +{ + compat_long_t u_new_limits[2]; + unsigned long new_limits[2]; + + if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits))) + return -EFAULT; + + new_limits[0] = u_new_limits[0]; + new_limits[1] = u_new_limits[1]; + + if (u_new_limits[0] == UB_MAXVALUE_COMPAT) + new_limits[0] = UB_MAXVALUE; + if (u_new_limits[1] == UB_MAXVALUE_COMPAT) + new_limits[1] = UB_MAXVALUE; + + return do_setublimit(uid, resource, new_limits); +} + +asmlinkage long compat_sys_ubstat(int func, unsigned int arg1, + unsigned int arg2, compat_uptr_t *buf, long size) +{ + return sys_ubstat(func, arg1, arg2, buf, size); +} +#endif diff -urNp linux-2.6.32.48/kernel/bc/vm_pages.c linux-2.6.32.48-openvz/kernel/bc/vm_pages.c --- linux-2.6.32.48/kernel/bc/vm_pages.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/bc/vm_pages.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,546 @@ +/* + * kernel/bc/vm_pages.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pte_t *pte; + spinlock_t *ptl; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + do { + if (!pte_none(*pte) && pte_present(*pte)) + (*ret)++; + } while (pte++, addr += PAGE_SIZE, (addr != end)); + pte_unmap_unlock(pte - 1, ptl); + + return addr; +} + +static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + next = pages_in_pte_range(vma, pmd, addr, next, ret); + } while (pmd++, addr = next, (addr != end)); + + return addr; +} + +static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + next = pages_in_pmd_range(vma, pud, addr, next, ret); + } while (pud++, addr = next, (addr != end)); + + return addr; +} + +unsigned long pages_in_vma_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + unsigned long ret; + + ret = 0; + BUG_ON(addr >= end); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + next = pages_in_pud_range(vma, pgd, addr, next, &ret); + } while (pgd++, addr = next, (addr != end)); + return ret; +} + +void __ub_update_physpages(struct user_beancounter *ub) +{ + ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages + + 
(ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT); + ub_adjust_maxheld(ub, UB_PHYSPAGES); +} + +void __ub_update_oomguarpages(struct user_beancounter *ub) +{ + ub->ub_parms[UB_OOMGUARPAGES].held = + ub->ub_parms[UB_PHYSPAGES].held + + ub->ub_parms[UB_SWAPPAGES].held; + ub_adjust_maxheld(ub, UB_OOMGUARPAGES); +} + +void __ub_update_privvm(struct user_beancounter *ub) +{ + ub->ub_parms[UB_PRIVVMPAGES].held = + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT) + + ub->ub_unused_privvmpages + + ub->ub_parms[UB_SHMPAGES].held; + ub_adjust_maxheld(ub, UB_PRIVVMPAGES); +} + +static inline int __charge_privvm_locked(struct user_beancounter *ub, + unsigned long s, enum ub_severity strict) +{ + if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0) + return -ENOMEM; + + ub->ub_unused_privvmpages += s; + return 0; +} + +static void __unused_privvm_dec_locked(struct user_beancounter *ub, + long size) +{ + /* catch possible overflow */ + if (ub->ub_unused_privvmpages < size) { + uncharge_warn(ub, UB_UNUSEDPRIVVM, + size, ub->ub_unused_privvmpages); + size = ub->ub_unused_privvmpages; + } + ub->ub_unused_privvmpages -= size; + __ub_update_privvm(ub); +} + +void __ub_unused_privvm_dec(struct mm_struct *mm, long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __unused_privvm_dec_locked(ub, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_unused_privvm_sub(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long count) +{ + if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) + __ub_unused_privvm_dec(mm, count); +} + +void ub_unused_privvm_add(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) + return; + + ub = top_beancounter(ub); + 
spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_unused_privvmpages += size; + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +int ub_protected_charge(struct mm_struct *mm, unsigned long size, + unsigned long newflags, struct vm_area_struct *vma) +{ + unsigned long flags; + struct file *file; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return PRIVVM_NO_CHARGE; + + flags = vma->vm_flags; + if (!((newflags ^ flags) & VM_WRITE)) + return PRIVVM_NO_CHARGE; + + file = vma->vm_file; + if (!VM_UB_PRIVATE(newflags | VM_WRITE, file)) + return PRIVVM_NO_CHARGE; + + if (flags & VM_WRITE) + return PRIVVM_TO_SHARED; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_privvm_locked(ub, size, UB_SOFT) < 0) + goto err; + spin_unlock_irqrestore(&ub->ub_lock, flags); + return PRIVVM_TO_PRIVATE; + +err: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return PRIVVM_ERROR; +} + +int ub_memory_charge(struct mm_struct *mm, unsigned long size, + unsigned vm_flags, struct file *vm_file, int sv) +{ + struct user_beancounter *ub, *ubl; + unsigned long flags; + + ub = mm->mm_ub; + if (ub == NULL) + return 0; + + size >>= PAGE_SHIFT; + if (size > UB_MAXVALUE) + return -EINVAL; + + BUG_ON(sv != UB_SOFT && sv != UB_HARD); + + if (vm_flags & VM_LOCKED) { + if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv)) + goto out_err; + } + if (VM_UB_PRIVATE(vm_flags, vm_file)) { + ubl = top_beancounter(ub); + spin_lock_irqsave(&ubl->ub_lock, flags); + if (__charge_privvm_locked(ubl, size, sv)) + goto out_private; + spin_unlock_irqrestore(&ubl->ub_lock, flags); + } + return 0; + +out_private: + spin_unlock_irqrestore(&ubl->ub_lock, flags); + if (vm_flags & VM_LOCKED) + uncharge_beancounter(ub, UB_LOCKEDPAGES, size); +out_err: + return -ENOMEM; +} + +void ub_memory_uncharge(struct mm_struct *mm, unsigned long size, + unsigned vm_flags, struct file *vm_file) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = mm->mm_ub; + if (ub 
== NULL) + return; + + size >>= PAGE_SHIFT; + + if (vm_flags & VM_LOCKED) + uncharge_beancounter(ub, UB_LOCKEDPAGES, size); + if (VM_UB_PRIVATE(vm_flags, vm_file)) { + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __unused_privvm_dec_locked(ub, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); + } +} + +int ub_locked_charge(struct mm_struct *mm, unsigned long size) +{ + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return 0; + + return charge_beancounter(ub, UB_LOCKEDPAGES, + size >> PAGE_SHIFT, UB_HARD); +} + +void ub_locked_uncharge(struct mm_struct *mm, unsigned long size) +{ + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); +} + +int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size) +{ + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return 0; + + return charge_beancounter(ub, UB_LOCKEDPAGES, + size >> PAGE_SHIFT, UB_HARD); +} + +void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size) +{ + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return; + + uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); +} + + +static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_tmpfs_respages++; + __ub_update_physpages(ub); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_tmpfs_respages_inc(struct shmem_inode_info *shi) +{ + struct user_beancounter *ub; + + for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) + do_ub_tmpfs_respages_inc(ub); +} + +static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + /* catch possible overflow */ + if (ub->ub_tmpfs_respages < size) { + uncharge_warn(ub, 
UB_TMPFSPAGES, + size, ub->ub_tmpfs_respages); + size = ub->ub_tmpfs_respages; + } + ub->ub_tmpfs_respages -= size; + /* update values what is the most interesting */ + __ub_update_physpages(ub); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_tmpfs_respages_sub(struct shmem_inode_info *shi, + unsigned long size) +{ + struct user_beancounter *ub; + + for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) + do_ub_tmpfs_respages_sub(ub, size); +} + +int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size) +{ + int ret; + unsigned long flags; + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return 0; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD); + if (ret == 0) + __ub_update_privvm(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); + return ret; +} + +void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_SHMPAGES, size); + __ub_update_privvm(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +#ifdef CONFIG_BC_SWAP_ACCOUNTING +static inline void do_ub_swapentry_inc(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __charge_beancounter_locked(ub, UB_SWAPPAGES, 1, UB_FORCE); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num, + struct user_beancounter *ub) +{ + si->swap_ubs[num] = get_beancounter(ub); + for (; ub != NULL; ub = ub->parent) + do_ub_swapentry_inc(ub); +} +EXPORT_SYMBOL(ub_swapentry_inc); + +static inline void do_ub_swapentry_dec(struct user_beancounter *ub) +{ + unsigned long flags; + + 
spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_SWAPPAGES, 1); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num) +{ + struct user_beancounter *ub, *ubp; + + ub = si->swap_ubs[num]; + si->swap_ubs[num] = NULL; + for (ubp = ub; ubp != NULL; ubp = ubp->parent) + do_ub_swapentry_dec(ubp); + put_beancounter(ub); +} +EXPORT_SYMBOL(ub_swapentry_dec); + +int ub_swap_init(struct swap_info_struct *si, pgoff_t num) +{ + struct user_beancounter **ubs; + + ubs = vmalloc(num * sizeof(struct user_beancounter *)); + if (ubs == NULL) + return -ENOMEM; + + memset(ubs, 0, num * sizeof(struct user_beancounter *)); + si->swap_ubs = ubs; + return 0; +} + +void ub_swap_fini(struct swap_info_struct *si) +{ + if (si->swap_ubs) { + vfree(si->swap_ubs); + si->swap_ubs = NULL; + } +} +#endif + +static int vmguar_enough_memory(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + struct user_beancounter *ub; + + if (event != VIRTINFO_ENOUGHMEM) + return old_ret; + /* + * If it's a kernel thread, don't care about it. + * Added in order aufsd to run smoothly over ramfs. 
+ */ + if (!current->mm) + return NOTIFY_DONE; + + ub = top_beancounter(current->mm->mm_ub); + if (ub->ub_parms[UB_PRIVVMPAGES].held > + ub->ub_parms[UB_VMGUARPAGES].barrier) + return old_ret; + + return NOTIFY_OK; +} + +static struct vnotifier_block vmguar_notifier_block = { + .notifier_call = vmguar_enough_memory +}; + +static int __init init_vmguar_notifier(void) +{ + virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block); + return 0; +} + +static void __exit fini_vmguar_notifier(void) +{ + virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block); +} + +module_init(init_vmguar_notifier); +module_exit(fini_vmguar_notifier); + +#ifdef CONFIG_PROC_FS +static int bc_vmaux_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + unsigned long swap, unmap; + int i; + + ub = seq_beancounter(f); + + swap = unmap = 0; + for_each_online_cpu(i) { + swap += per_cpu_ptr(ub->ub_percpu, i)->swapin; + unmap += per_cpu_ptr(ub->ub_percpu, i)->unmap; + } + + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_UNUSEDPRIVVM], + ub->ub_unused_privvmpages); + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_TMPFSPAGES], + ub->ub_tmpfs_respages); + seq_printf(f, bc_proc_lu_fmt, "rss", ub->ub_pbcs); + + seq_printf(f, bc_proc_lu_fmt, "swapin", swap); + seq_printf(f, bc_proc_lu_fmt, "unmap", unmap); + return 0; +} +static struct bc_proc_entry bc_vmaux_entry = { + .name = "vmaux", + .u.show = bc_vmaux_show, +}; + +static int __init bc_vmaux_init(void) +{ + bc_register_proc_entry(&bc_vmaux_entry); + return 0; +} + +late_initcall(bc_vmaux_init); +#endif diff -urNp linux-2.6.32.48/kernel/cgroup.c linux-2.6.32.48-openvz/kernel/cgroup.c --- linux-2.6.32.48/kernel/cgroup.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cgroup.c 2011-11-21 17:40:47.000000000 -0500 @@ -2127,7 +2127,7 @@ static void cgroup_enable_task_cg_lists( struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; - do_each_thread(g, p) { + 
do_each_thread_all(g, p) { task_lock(p); /* * We should check if the process is exiting, otherwise @@ -2137,7 +2137,7 @@ static void cgroup_enable_task_cg_lists( if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); write_unlock(&css_set_lock); } diff -urNp linux-2.6.32.48/kernel/cgroup_lite.c linux-2.6.32.48-openvz/kernel/cgroup_lite.c --- linux-2.6.32.48/kernel/cgroup_lite.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cgroup_lite.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,342 @@ +/* + * lite cgroups engine + */ + +#include +#include +#include +#include +#include +#include + +#define SUBSYS(_x) &_x ## _subsys, + +static struct cgroup_subsys *subsys[] = { +#include +}; + +static struct css_set init_css_set; +static struct cgroup init_cgroup; +static struct cftype *subsys_cftypes[CGROUP_SUBSYS_COUNT]; + +static struct idr cgroup_idr; +static DEFINE_SPINLOCK(cgroup_idr_lock); + +unsigned short css_id(struct cgroup_subsys_state *css) +{ + return css->cgroup->cgroup_lite_id; +} + +unsigned short css_depth(struct cgroup_subsys_state *css) +{ + return (css->cgroup == &init_cgroup) ? 
0 : 1; +} + +int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) +{ + snprintf(buf, buflen, "/%d", cgrp->cgroup_lite_id); + return 0; +} + +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) +{ + struct cgroup *g; + + BUG_ON(!ss->use_id); + g = idr_find(&cgroup_idr, id); + if (!g) + return NULL; + return g->subsys[ss->subsys_id]; +} + +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) +{ +} + +static int init_cgroup_id(struct cgroup *g) +{ + int err, id; + + if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL))) + return -ENOMEM; + + spin_lock(&cgroup_idr_lock); + err = idr_get_new_above(&cgroup_idr, g, 1, &id); + spin_unlock(&cgroup_idr_lock); + + if (err) + return err; + + if (id > USHORT_MAX) { + spin_lock(&cgroup_idr_lock); + idr_remove(&cgroup_idr, id); + spin_unlock(&cgroup_idr_lock); + return -ENOSPC; + } + + g->cgroup_lite_id = id; + + return 0; +} + +static void fini_cgroup_id(struct cgroup *g) +{ + spin_lock(&cgroup_idr_lock); + idr_remove(&cgroup_idr, g->cgroup_lite_id); + spin_unlock(&cgroup_idr_lock); +} + +void __css_put(struct cgroup_subsys_state *css) +{ + atomic_dec(&css->refcnt); +} + +static int init_css_set_subsystems(struct cgroup *g, struct css_set *set) +{ + int i; + struct cgroup_subsys_state *ss; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *cs = subsys[i]; + + ss = cs->create(cs, g); + if (IS_ERR(ss)) + goto destroy; + + g->subsys[i] = ss; + set->subsys[i] = ss; + atomic_set(&ss->refcnt, 1); + ss->cgroup = g; + } + return 0; + +destroy: + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *cs = subsys[i]; + + if (g->subsys[i]) + cs->destroy(cs, g); + } + return PTR_ERR(ss); +} + +int init_ve_cgroups(struct ve_struct *ve) +{ + int err = -ENOMEM; + struct cgroup *g; + struct css_set *cs; + + g = kzalloc(sizeof(struct cgroup), GFP_KERNEL); + if (g == NULL) + goto err_galloc; + + cs = kzalloc(sizeof(struct css_set), GFP_KERNEL); + if (cs == 
NULL) + goto err_calloc; + + err = init_cgroup_id(g); + if (err) + goto err_id; + + g->parent = &init_cgroup; + err = init_css_set_subsystems(g, cs); + if (err) + goto err_subsys; + + g->parent = &init_cgroup; + ve->ve_cgroup = g; + ve->ve_css_set = cs; + return 0; + +err_subsys: + fini_cgroup_id(g); +err_id: + kfree(cs); +err_calloc: + kfree(g); +err_galloc: + return err; +} +EXPORT_SYMBOL(init_ve_cgroups); + +void fini_ve_cgroups(struct ve_struct *ve) +{ + int i; + struct cgroup *g = ve->ve_cgroup; + struct css_set *css = ve->ve_css_set; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *cs = subsys[i]; + struct cgroup_subsys_state *ss = css->subsys[i]; + + BUG_ON(ss != g->subsys[i]); + + if (cs->pre_destroy) + cs->pre_destroy(cs, g); + + if (atomic_read(&ss->refcnt) != 1) + printk(KERN_ERR "CG: leaking %d/%s subsys\n", + ve->veid, subsys[i]->name); + else + cs->destroy(cs, g); + } + + fini_cgroup_id(g); + kfree(g); + kfree(css); + ve->ve_cgroup = NULL; + ve->ve_css_set = NULL; +} +EXPORT_SYMBOL(fini_ve_cgroups); + +/* + * task lifecycle + */ + +void cgroup_fork(struct task_struct *child) +{ + child->cgroups = current->cgroups; +} + +void cgroup_fork_callbacks(struct task_struct *child) +{ +} + +void cgroup_post_fork(struct task_struct *child) +{ +} + +void cgroup_exit(struct task_struct *tsk, int dummy) +{ + tsk->cgroups = &init_css_set; +} + +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +{ + return -ENODATA; +} + +int cgroup_set_task_css(struct task_struct *tsk, struct css_set *css) +{ + int i, err; + struct cgroup_subsys *cs; + struct css_set *old_css; + + old_css = tsk->cgroups; + + if (old_css == css) + return 0; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + cs = subsys[i]; + if (!cs->can_attach) + continue; + err = cs->can_attach(cs, css->subsys[i]->cgroup, tsk, false); + if (err) + return err; + } + + tsk->cgroups = css; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + cs = subsys[i]; + if (!cs->attach) 
+			continue;
+		cs->attach(cs, css->subsys[i]->cgroup,
+				old_css->subsys[i]->cgroup, tsk, false);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(cgroup_set_task_css);
+
+/*
+ * proc struts
+ */
+
+/*
+ * Print the address of the css_set attached to the task behind this proc
+ * file.  A counted task reference is taken so the task cannot go away
+ * while it is dereferenced; -ESRCH is returned if the pid no longer maps
+ * to a task.
+ */
+static int proc_cgroup_show(struct seq_file *m, void *v)
+{
+	struct task_struct *tsk;
+
+	tsk = get_pid_task((struct pid *)m->private, PIDTYPE_PID);
+	if (tsk == NULL)
+		return -ESRCH;
+
+	seq_printf(m, "%p\n", tsk->cgroups);
+	put_task_struct(tsk);
+	return 0;
+}
+
+/* Only CAP_SYS_ADMIN may open the file; the inode's pid is the seq data. */
+static int cgroup_open(struct inode *inode, struct file *file)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return single_open(file, proc_cgroup_show, PROC_I(inode)->pid);
+}
+
+const struct file_operations proc_cgroup_operations = {
+	.open		= cgroup_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/*
+ * cgroups misc struts
+ */
+
+/*
+ * Remember the cftype table of @subsys the first time it is offered and
+ * BUG if a later call presents a different table for the same subsystem.
+ * No files are created; @cgrp and @count are unused.
+ */
+int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+		const struct cftype cft[], int count)
+{
+	int idx = subsys->subsys_id;
+	static DEFINE_SPINLOCK(add_files_lock);
+
+	if (unlikely(subsys_cftypes[idx] == NULL)) {
+		spin_lock(&add_files_lock);
+		/* re-check under the lock: another caller may have won */
+		if (subsys_cftypes[idx] == NULL)
+			subsys_cftypes[idx] = (struct cftype *)cft;
+		spin_unlock(&add_files_lock);
+	}
+
+	BUG_ON(subsys_cftypes[idx] != cft);
+	return 0;
+}
+
+/* The lite engine has no global cgroup lock: the next two are no-ops. */
+void cgroup_lock(void)
+{
+}
+
+void cgroup_unlock(void)
+{
+}
+
+/* Always reports success; nothing is locked. */
+bool cgroup_lock_live_group(struct cgroup *cg)
+{
+	return 1;
+}
+
+
+/* Always reports "not removed". */
+int cgroup_is_removed(const struct cgroup *cgrp)
+{
+	return 0;
+}
+
+/*
+ * Attach init_task to the initial css_set and BUG if any compiled-in
+ * subsystem asks for early initialisation.
+ */
+int __init cgroup_init_early(void)
+{
+	int i;
+
+	init_task.cgroups = &init_css_set;
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		BUG_ON(subsys[i]->early_init);
+
+	return 0;
+}
+
+/*
+ * Wire the initial cgroup and css_set into ve0, set up the id allocator
+ * and create the per-subsystem state of the initial cgroup.  Any failure
+ * panics.
+ */
+int __init cgroup_init(void)
+{
+	get_ve0()->ve_cgroup = &init_cgroup;
+	get_ve0()->ve_css_set = &init_css_set;
+	idr_init(&cgroup_idr);
+	if (init_cgroup_id(&init_cgroup))
+		panic("CG: Can't init initial cgroup id\n");
+	if (init_css_set_subsystems(&init_cgroup, &init_css_set) != 0)
+		panic("CG: Can't init initial set\n");
+	return 0;
+}
diff -urNp linux-2.6.32.48/kernel/compat.c
linux-2.6.32.48-openvz/kernel/compat.c --- linux-2.6.32.48/kernel/compat.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/compat.c 2011-11-21 17:40:47.000000000 -0500 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -101,7 +102,7 @@ int put_compat_timespec(const struct tim __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static long compat_nanosleep_restart(struct restart_block *restart) +long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; struct timespec rmt; @@ -123,6 +124,7 @@ static long compat_nanosleep_restart(str return ret; } +EXPORT_SYMBOL_GPL(compat_nanosleep_restart); asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, struct compat_timespec __user *rmtp) diff -urNp linux-2.6.32.48/kernel/cpt/cpt_conntrack.c linux-2.6.32.48-openvz/kernel/cpt/cpt_conntrack.c --- linux-2.6.32.48/kernel/cpt/cpt_conntrack.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_conntrack.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,365 @@ +/* + * + * kernel/cpt/cpt_conntrack.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + + +/* How does it work? + * + * Network is disabled, so new conntrack entries will not appear. + * However, some of them can disappear because of timeouts. 
+ * + * So, we take read_lock, collect all required information atomically, + * essentially, creating parallel "refcount" structures holding pointers. + * We delete conntrack timers as well, so the structures cannot disappear + * after releasing the lock. Now, after releasing lock we can dump everything + * safely. And on exit we restore timers to their original values. + * + * Note, this approach is not going to work in VE0. + */ + +struct ct_holder +{ + struct ct_holder *next; + struct ip_conntrack_tuple_hash *cth; + int index; +}; + +static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple) +{ + v->cpt_dst = tuple->dst.ip; + v->cpt_dstport = tuple->dst.u.all; + v->cpt_protonum = tuple->dst.protonum; + v->cpt_dir = tuple->dst.dir; + + v->cpt_src = tuple->src.ip; + v->cpt_srcport = tuple->src.u.all; +} + +static int dump_one_expect(struct cpt_ip_connexpect_image *v, + struct ip_conntrack_expect *exp, + int sibling, cpt_context_t *ctx) +{ + int err = 0; + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + encode_tuple(&v->cpt_tuple, &exp->tuple); + encode_tuple(&v->cpt_mask, &exp->mask); + v->cpt_sibling_conntrack = sibling; + v->cpt_flags = exp->flags; + v->cpt_seq = exp->id; + v->cpt_dir = 0; + v->cpt_manip_proto = 0; +#ifdef CONFIG_IP_NF_NAT_NEEDED + v->cpt_manip_proto = exp->saved_proto.all; + v->cpt_dir = exp->dir; +#endif + v->cpt_timeout = 0; + if (exp->master->helper->timeout) + v->cpt_timeout = exp->timeout.expires - jiffies; + return err; +} + +/* NOTE. We use one page to dump list of expectations. This may be not enough + * in theory. In practice there is only one expectation per conntrack record. 
+ * Moreover, taking into account that _ALL_ of expectations are saved in one
+ * global list, which is looked up each incoming/outgoing packet, the system
+ * would be severely dead when even one conntrack would have so much of
+ * expectations. Shortly, I am not going to repair this.
+ */
+
+/*
+ * Dump every expectation whose master is @ct as an array of
+ * cpt_ip_connexpect_image records, staged in a single page.
+ *
+ * Returns 0 on success (also when @ct expects nothing), -ENOBUFS when the
+ * expectations do not fit into one page, -ENOMEM when the page cannot be
+ * allocated and -EINVAL when @ct has expectations but no helper.
+ * @list is only referenced by the disabled sibling lookup below.
+ */
+static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list,
+			    cpt_context_t *ctx)
+{
+	int err = 0;
+	unsigned long pg;
+	struct cpt_ip_connexpect_image *v;
+	struct ip_conntrack_expect *exp;
+
+	if (ct->expecting == 0)
+		return err;
+	if (ct->expecting*sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE)
+		return -ENOBUFS;
+
+	/* Zeroed, so bytes dump_one_expect() does not set never reach the image. */
+	pg = get_zeroed_page(GFP_KERNEL);
+	if (!pg)
+		return -ENOMEM;
+	v = (struct cpt_ip_connexpect_image *)pg;
+
+	read_lock_bh(&ip_conntrack_lock);
+	list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) {
+		int sibling;
+
+		if (exp->master != ct)
+			continue;
+
+		if (ct->helper == NULL) {
+			eprintk_ctx("conntrack: no helper and non-trivial expectation\n");
+			err = -EINVAL;
+			break;
+		}
+
+		sibling = 0;
+#if 0
+		/* That's all? No need to calculate sibling? */
+		if (exp->sibling) {
+			struct ct_holder *c;
+			for (c = list; c; c = c->next) {
+				if (tuplehash_to_ctrack(c->cth) == exp->sibling) {
+					sibling = c->index;
+					break;
+				}
+			}
+			/* NOTE: exp->sibling could be not "confirmed" and, hence,
+			 * out of hash table. We should just ignore such a sibling,
+			 * the connection is going to be retried, the packet
+			 * apparently was lost somewhere.
+			 */
+			if (sibling == 0)
+				dprintk_ctx("sibling conntrack is not found\n");
+		}
+#endif
+
+		/* If the expectation still does not have exp->sibling
+		 * and timer is not running, it is about to die on another
+		 * cpu. Skip it. */
+		if (!sibling &&
+		    ct->helper->timeout &&
+		    !timer_pending(&exp->timeout)) {
+			dprintk_ctx("conntrack: expectation: no timer\n");
+			continue;
+		}
+
+		/* ct->expecting was checked before the lock was taken; make
+		 * sure the record still fits so we never write past the page. */
+		if ((unsigned long)(v + 1) > pg + PAGE_SIZE) {
+			err = -ENOBUFS;
+			break;
+		}
+
+		err = dump_one_expect(v, exp, sibling, ctx);
+		if (err)
+			break;
+
+		v++;
+	}
+	read_unlock_bh(&ip_conntrack_lock);
+
+	/* Write only what was filled in, and only if nothing failed. */
+	if (err == 0 && (unsigned long)v != pg)
+		ctx->write((void*)pg, (unsigned long)v - pg, ctx);
+
+	free_page(pg);
+	return err;
+}
+
+/*
+ * Dump the conntrack held by @c as one CPT_OBJ_NET_CONNTRACK object followed
+ * by its expectations.  @list is the whole holder list and is only passed on
+ * to dump_expect_list().
+ *
+ * Returns 0 on success, -EINVAL when the image and kernel disagree on the
+ * size of the per-protocol data, or the error of dump_expect_list().
+ */
+static int dump_one_ct(struct ct_holder *c, struct ct_holder *list,
+		       cpt_context_t *ctx)
+{
+	struct ip_conntrack_tuple_hash *h = c->cth;
+	struct ip_conntrack *ct = tuplehash_to_ctrack(h);
+	struct cpt_ip_conntrack_image v;
+	int err = 0;
+
+	if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) {
+		eprintk_ctx("conntrack module ct->proto version mismatch\n");
+		return -EINVAL;
+	}
+
+	/* The whole structure goes to the dump file; clear it first so that
+	 * fields which are not filled below (e.g. the NAT sequence data when
+	 * CONFIG_IP_NF_NAT_NEEDED is off) do not carry stale stack contents. */
+	memset(&v, 0, sizeof(v));
+
+	cpt_open_object(NULL, ctx);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NET_CONNTRACK;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_ARRAY;
+
+	/* Snapshot the mutable state under the conntrack lock. */
+	read_lock_bh(&ip_conntrack_lock);
+	v.cpt_status = ct->status;
+	v.cpt_timeout = ct->timeout.expires - jiffies;
+	v.cpt_ct_helper = (ct->helper != NULL);
+	v.cpt_index = c->index;
+	v.cpt_id = ct->id;
+	v.cpt_mark = 0;
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+	v.cpt_mark = ct->mark;
+#endif
+	encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple);
+	encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple);
+	memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data));
+	/* Sizes of the helper data are not compared above, so never read
+	 * more than ct->help actually holds. */
+	memcpy(&v.cpt_help_data, &ct->help,
+	       min(sizeof(v.cpt_help_data), sizeof(ct->help)));
+
+	v.cpt_masq_index = 0;
+	v.cpt_initialized = 0;
+	v.cpt_num_manips = 0;
+	v.cpt_nat_helper = 0;
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+	defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+	v.cpt_masq_index = ct->nat.masq_index;
+#endif
+	/* "help" data is used by pptp, difficult to support */
+	v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos;
+	v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before;
+	v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after;
+	v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos;
+	v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before;
+	v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after;
+#endif
+	read_unlock_bh(&ip_conntrack_lock);
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	err = dump_expect_list(ct, list, ctx);
+
+	cpt_close_object(ctx);
+	return err;
+}
+
+/*
+ * Dump all conntracks of the current VE into section CPT_SECT_NET_CONNTRACK.
+ *
+ * Holders are preallocated outside the lock, then the hash table is walked
+ * under the lock and the timer of every collected conntrack is deleted.
+ * All timers deleted here are re-armed before returning.
+ *
+ * Returns 0 on success or when the VE has no conntrack state, -ENOMEM when
+ * holders cannot be allocated or run out, or the error of dump_one_ct().
+ */
+int cpt_dump_ip_conntrack(cpt_context_t * ctx)
+{
+	struct ct_holder *ct_list = NULL;
+	struct ct_holder *c, **cp;
+	int err = 0;
+	int index = 0;
+	int idx;
+
+	if (get_exec_env()->_ip_conntrack == NULL)
+		return 0;
+
+	/* One holder per counted conntrack, plus one spare (idx runs to 0). */
+	for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) {
+		c = kzalloc(sizeof(*c), GFP_KERNEL);
+		if (c == NULL) {
+			err = -ENOMEM;
+			goto done;
+		}
+		c->next = ct_list;
+		ct_list = c;
+	}
+
+	c = ct_list;
+
+	read_lock_bh(&ip_conntrack_lock);
+	for (idx = 0; idx < ip_conntrack_htable_size; idx++) {
+		struct ip_conntrack_tuple_hash *h;
+		list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) {
+			/* Skip reply tuples, they are covered by original
+			 * direction. */
+			if (DIRECTION(h))
+				continue;
+
+			/* Oops, we have not enough of holders...
+			 * It is impossible. */
+			if (unlikely(c == NULL)) {
+				read_unlock_bh(&ip_conntrack_lock);
+				eprintk_ctx("unexpected conntrack appeared\n");
+				err = -ENOMEM;
+				goto done;
+			}
+
+			/* If timer is not running, it means that it
+			 * has just been scheduled on another cpu.
+			 * We should skip this conntrack, it is about to be
+			 * destroyed. */
+			if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) {
+				dprintk_ctx("conntrack: no timer\n");
+				continue;
+			}
+
+			/* Timer is deleted. refcnt is _not_ decreased.
+			 * We are going to restore the timer on exit
+			 * from this function. */
+			c->cth = h;
+			c->index = ++index;
+			c = c->next;
+		}
+	}
+	read_unlock_bh(&ip_conntrack_lock);
+
+	/* No conntracks? Good. */
+	if (index == 0)
+		goto done;
+
+	/* Comb the list a little. */
+	cp = &ct_list;
+	while ((c = *cp) != NULL) {
+		/* Discard unused entries; they can appear, if some
+		 * entries were timed out since we preallocated the list.
+		 */
+		if (c->cth == NULL) {
+			*cp = c->next;
+			kfree(c);
+			continue;
+		}
+
+		/* Move conntracks attached to expectations to the beginning
+		 * of the list. */
+		if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) {
+			*cp = c->next;
+			c->next = ct_list;
+			ct_list = c;
+			dprintk_ctx("conntrack: %d moved in list\n", c->index);
+			continue;
+		}
+		cp = &c->next;
+	}
+
+	cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK);
+
+	for (c = ct_list; c; c = c->next) {
+		err = dump_one_ct(c, ct_list, ctx);
+		if (err)
+			break;
+	}
+
+	/* Close the section on failure too, so that ctx->current_section
+	 * is not left pointing at a half-written section. */
+	cpt_close_section(ctx);
+
+done:
+	while ((c = ct_list) != NULL) {
+		ct_list = c->next;
+		if (c->cth) {
+			/* Restore timer. refcnt is preserved. */
+			add_timer(&tuplehash_to_ctrack(c->cth)->timeout);
+		}
+		kfree(c);
+	}
+	return err;
+}
+
+#endif
diff -urNp linux-2.6.32.48/kernel/cpt/cpt_context.c linux-2.6.32.48-openvz/kernel/cpt/cpt_context.c
--- linux-2.6.32.48/kernel/cpt/cpt_context.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.32.48-openvz/kernel/cpt/cpt_context.c	2011-11-21 17:40:47.000000000 -0500
@@ -0,0 +1,285 @@
+/*
+ *
+ * kernel/cpt/cpt_context.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+
+/*
+ * Common body of file_write()/file_pwrite(): write @count bytes from @addr
+ * to the dump file at *@pos, or at the file's own position when @pos is NULL.
+ * The first failure is latched in ctx->write_error; a short write is
+ * recorded as -EIO, a missing file or write method as -EBADF.
+ */
+static void cpt_file_write_at(const void *addr, size_t count,
+			      struct cpt_context *ctx, loff_t *pos)
+{
+	mm_segment_t oldfs;
+	ssize_t err = -EBADF;
+	struct file *file = ctx->file;
+
+	/* Do not call through a NULL f_op/write pointer. */
+	if (file && file->f_op && file->f_op->write) {
+		/* The buffer is a kernel address, lift the user-space limit. */
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = file->f_op->write(file, addr, count,
+					pos ? pos : &file->f_pos);
+		set_fs(oldfs);
+	}
+	if (err != count && !ctx->write_error)
+		ctx->write_error = err < 0 ? err : -EIO;
+}
+
+/* ctx->write: append to the dump file at its current position. */
+static void file_write(const void *addr, size_t count, struct cpt_context *ctx)
+{
+	cpt_file_write_at(addr, count, ctx, NULL);
+}
+
+/* ctx->pwrite: write at @pos without moving the file position. */
+static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos)
+{
+	cpt_file_write_at(addr, count, ctx, &pos);
+}
+
+/* ctx->align: round the file position up with CPT_ALIGN(). */
+static void file_align(struct cpt_context *ctx)
+{
+	struct file *file = ctx->file;
+
+	if (file)
+		file->f_pos = CPT_ALIGN(file->f_pos);
+}
+
+/* cpt_ops.push_object: save the current object in *@p and open a nested one. */
+static void cpt_push(loff_t *p, struct cpt_context *ctx)
+{
+	cpt_push_object(p, ctx);
+	cpt_open_object(NULL, ctx);
+}
+
+/* cpt_ops.pop_object: close the nested object and restore the one in *@p. */
+static void cpt_pop(loff_t *p, struct cpt_context *ctx)
+{
+	cpt_close_object(ctx);
+	cpt_pop_object(p, ctx);
+}
+
+/*
+ * cpt_ops.lookup_object: position of the object registered for (@type, @p),
+ * or CPT_NULL when no such object is known.
+ */
+static loff_t lookup_cpt_object_pos(int type, void *p, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+	if (obj == NULL)
+		return CPT_NULL;
+	return obj->o_pos;
+}
+
+struct cpt_ops cpt_ops = {
+	.write = file_write,
+	.push_object = cpt_push,
+	.pop_object = cpt_pop,
+	.lookup_object = lookup_cpt_object_pos,
+};
+
+/*
+ * Reset @ctx to a pristine state: everything zeroed, one reference held,
+ * no open section/object, file-backed I/O callbacks installed and all
+ * section positions set to CPT_NULL.
+ */
+void cpt_context_init(struct cpt_context *ctx)
+{
+	int i;
+
+	memset(ctx, 0, sizeof(*ctx));
+
+	init_MUTEX(&ctx->main_sem);
+	ctx->refcount = 1;
+
+	ctx->current_section = -1;
+	ctx->current_object = -1;
+	ctx->pagesize = PAGE_SIZE;
+	ctx->write = file_write;
+	ctx->pwrite = file_pwrite;
+	ctx->align = file_align;
+	for (i=0; i < CPT_SECT_MAX; i++)
+		ctx->sections[i] = CPT_NULL;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+	init_completion(&ctx->pgin_notify);
+#endif
+	cpt_object_init(ctx);
+}
+
+/* Allocate the one-page scratch buffer and mark it free.  0 or -ENOMEM. */
+int cpt_open_dumpfile(struct cpt_context *ctx)
+{
+	ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL);
+	if (ctx->tmpbuf == NULL)
+		return -ENOMEM;
+	__cpt_release_buf(ctx);
+	return 0;
+}
+
+/*
+ * Drop the dump file reference and the scratch buffer.  Returns the first
+ * write error recorded during the dump (0 if none) and reports it.
+ */
+int cpt_close_dumpfile(struct cpt_context *ctx)
+{
+	if (ctx->file) {
+		fput(ctx->file);
+		ctx->file = NULL;
+	}
+	if (ctx->tmpbuf) {
+		free_page((unsigned long)ctx->tmpbuf);
+		ctx->tmpbuf = NULL;
+	}
+	if (ctx->write_error)
+		eprintk_ctx("error while writing dump file: %d\n", ctx->write_error);
+	return ctx->write_error;
+}
+
+/* Write the major image header.  A no-op without a dump file; returns 0. */
+int cpt_major_hdr_out(struct cpt_context *ctx)
+{
+	struct cpt_major_hdr hdr;
+
+	if (ctx->file == NULL)
+		return 0;
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.cpt_signature[0] = CPT_SIGNATURE0;
+	hdr.cpt_signature[1] = CPT_SIGNATURE1;
+	hdr.cpt_signature[2] = CPT_SIGNATURE2;
+	hdr.cpt_signature[3] = CPT_SIGNATURE3;
+	hdr.cpt_hdrlen = sizeof(hdr);
+	hdr.cpt_image_version = CPT_CURRENT_VERSION;
+#ifdef CONFIG_X86_64
+	hdr.cpt_os_arch = CPT_OS_ARCH_EMT64;
+#elif defined(CONFIG_X86_32)
+	hdr.cpt_os_arch = CPT_OS_ARCH_I386;
+#elif defined(CONFIG_IA64)
+	hdr.cpt_os_arch = CPT_OS_ARCH_IA64;
+#else
+#error	Arch is not supported
+#endif
+	/* The 64-bit feature mask is stored as two 32-bit halves. */
+	hdr.cpt_ve_features = (__u32)ctx->features;
+	hdr.cpt_ve_features2 = (__u32)(ctx->features>>32);
+	hdr.cpt_pagesize = (__u16)PAGE_SIZE;
+	hdr.cpt_hz = HZ;
+	hdr.cpt_start_jiffies64 = ctx->virt_jiffies64;
+	hdr.cpt_start_sec = ctx->start_time.tv_sec;
+	hdr.cpt_start_nsec = ctx->start_time.tv_nsec;
+	hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags;
+	hdr.cpt_kernel_config[0] = ctx->kernel_config_flags;
+	hdr.cpt_iptables_mask = ctx->iptables_mask;
+
+	ctx->write(&hdr, sizeof(hdr), ctx);
+	return 0;
+}
+
+/*
+ * Finish the open section: patch its length (current file position minus
+ * section start) into the first 8 bytes of the section.  Returns 0.
+ */
+int cpt_close_section(struct cpt_context *ctx)
+{
+	if (ctx->file && ctx->current_section >= 0) {
+		__u64 next = ctx->file->f_pos - ctx->current_section;
+		ctx->pwrite(&next, sizeof(next), ctx, ctx->current_section);
+		ctx->current_section = -1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cpt_close_section);
+
+/*
+ * Start section @type at the current file position, closing any section
+ * that is still open, and remember its position in ctx->sections[].
+ * A no-op without a dump file.  Returns 0.
+ */
+int cpt_open_section(struct cpt_context *ctx, __u32 type)
+{
+	struct cpt_section_hdr hdr;
+
+	if (ctx->file == NULL)
+		return 0;
+
+	cpt_close_section(ctx);
+
+	ctx->current_section = ctx->file->f_pos;
+	ctx->sections[type] = ctx->current_section;
+
+	/* cpt_next is patched by cpt_close_section(). */
+	hdr.cpt_next = 0;
+	hdr.cpt_section = type;
+	hdr.cpt_hdrlen = sizeof(hdr);
+	hdr.cpt_align = 0;
+	ctx->write(&hdr, sizeof(hdr), ctx);
+
+	return 0;
+}
+EXPORT_SYMBOL(cpt_open_section);
+
+
+/*
+ * Finish the open object: patch its length into the first 8 bytes of the
+ * object.  Returns 0.
+ */
+int cpt_close_object(struct cpt_context *ctx)
+{
+	if (ctx->file && ctx->current_object >= 0) {
+		__u64 next = ctx->file->f_pos - ctx->current_object;
+		ctx->pwrite(&next, sizeof(next), ctx, ctx->current_object);
+		ctx->current_object = -1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cpt_close_object);
+
+/*
+ * Start a new object at the current file position, closing any object that
+ * is still open.  When @obj is given its position is recorded as well.
+ * A no-op without a dump file.  Returns 0.
+ */
+int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	if (ctx->file == NULL)
+		return 0;
+
+	cpt_close_object(ctx);
+
+	ctx->current_object = ctx->file->f_pos;
+	if (obj)
+		cpt_obj_setpos(obj, ctx->current_object, ctx);
+
+	return 0;
+}
+EXPORT_SYMBOL(cpt_open_object);
+
+/*
+ * Save the position of the current object in *@saved and make the current
+ * file position the new current object.  A no-op without a dump file.
+ */
+int cpt_push_object(loff_t *saved, struct cpt_context *ctx)
+{
+	if (ctx->file) {
+		*saved = ctx->current_object;
+		ctx->current_object = ctx->file->f_pos;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cpt_push_object);
+
+/*
+ * Restore the object position saved by cpt_push_object().  Mirrors its
+ * ctx->file check: without a file nothing was stored in *@saved, so it
+ * must not be read back.
+ */
+int cpt_pop_object(loff_t *saved, struct cpt_context *ctx)
+{
+	if (ctx->file)
+		ctx->current_object = *saved;
+	return 0;
+}
+EXPORT_SYMBOL(cpt_pop_object);
+
+int cpt_dump_tail(struct cpt_context *ctx)
+{
+	struct cpt_major_tail hdr;
+	int i;
+
+	if (ctx->file == NULL)
+		return 0;
+
+	cpt_open_section(ctx, CPT_SECT_TRAILER);
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.cpt_next = sizeof(hdr);
+	hdr.cpt_object = CPT_OBJ_TRAILER;
+	hdr.cpt_hdrlen = sizeof(hdr);
+	hdr.cpt_content = CPT_CONTENT_VOID;
+	hdr.cpt_lazypages = 0;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+	hdr.cpt_lazypages = ctx->lazypages;
+#endif
+	hdr.cpt_64bit = ctx->tasks64;
+	hdr.cpt_signature[0] =
CPT_SIGNATURE0; + hdr.cpt_signature[1] = CPT_SIGNATURE1; + hdr.cpt_signature[2] = CPT_SIGNATURE2; + hdr.cpt_signature[3] = CPT_SIGNATURE3; + hdr.cpt_nsect = CPT_SECT_MAX_INDEX; + for (i = 0; i < CPT_SECT_MAX_INDEX; i++) + hdr.cpt_sections[i] = ctx->sections[i]; + + ctx->write(&hdr, sizeof(hdr), ctx); + cpt_close_section(ctx); + return 0; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_context.h linux-2.6.32.48-openvz/kernel/cpt/cpt_context.h --- linux-2.6.32.48/kernel/cpt/cpt_context.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_context.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,225 @@ +#include +#include +#include + +#define CPT_CTX_ERROR -1 +#define CPT_CTX_IDLE 0 +#define CPT_CTX_SUSPENDING 1 +#define CPT_CTX_SUSPENDED 2 +#define CPT_CTX_DUMPING 3 +#define CPT_CTX_UNDUMPING 4 +#define CPT_CTX_UNDUMPED 5 + +#define CPT_TID(tsk) task_pid_nr(tsk), task_pid_vnr(tsk), (tsk)->comm +#define CPT_FID "%d,%d(%s)" + + +typedef struct cpt_context +{ + struct list_head ctx_list; + int refcount; + int ctx_state; + int objcount; + int sticky; + struct semaphore main_sem; + + struct file *errorfile; + struct file *statusfile; + struct file *lockfile; + + int errno; + char *error_msg; + loff_t err_offset; + + struct file *file; + char *tmpbuf; + int pagesize; +#ifdef CONFIG_VZ_CHECKPOINT_ITER + int iter_done; + void *iter_dir; + struct user_beancounter *iter_ub; +#endif + loff_t current_section; + loff_t current_object; + + loff_t sections[CPT_SECT_MAX]; + + __u32 errormask; + __u32 write_error; + + struct list_head object_array[CPT_OBJ_MAX]; + + void (*write)(const void *addr, size_t count, struct cpt_context *ctx); + void (*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); + ssize_t (*read)(void *addr, size_t count, struct cpt_context *ctx); + ssize_t (*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); + void (*align)(struct cpt_context *ctx); + int ve_id; + int contextid; + struct timespec 
cpt_monotonic_time; /* Host monotonic time at the moment of cpt/rst + * corresponging to start_time */ + __u64 virt_jiffies64; /* Virtual jiffies64. It is == cpt_jiffies64 when + * VE did not migrate. */ + struct timespec start_time; + struct timespec delta_time; + __s64 delta_nsec; + int image_version; + __u16 image_arch; + __u64 iptables_mask; + __u64 features; + +#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9) +#define CPT_ANONVMA_HSIZE (1<ve_id, ##arg) + +#define wprintk(a...) cpt_printk(2, "CPT WRN: " a) +#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg) + +#define eprintk(a...) cpt_printk(1, "CPT ERR: " a) +#define eprintk_ctx(f, arg...) \ +do { \ + eprintk("%p,%u :" f, ctx, ctx->ve_id, ##arg); \ + if (ctx->error_msg && ctx->err_offset < PAGE_SIZE) \ + ctx->err_offset += snprintf((char*)(ctx->error_msg + \ + ctx->err_offset), \ + PAGE_SIZE - ctx->err_offset, \ + "Error: " f, ##arg); \ +} while(0) + +#define CPT_TMPBUF_FREE 0x789adf12 +#define CPT_TMPBUF_BUSY 0xabcd9876 + +static inline void *cpt_get_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE); + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY; + return buf; +} + +static inline void __cpt_release_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; +} + +static inline void cpt_release_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY); + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; +} + +static inline void cpt_flush_error(cpt_context_t *ctx) +{ + mm_segment_t oldfs; + + if (ctx->errorfile && ctx->error_msg && ctx->err_offset) { + if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) { + oldfs = get_fs(); + set_fs(KERNEL_DS); + ctx->errorfile->f_op->write(ctx->errorfile, + ctx->error_msg, ctx->err_offset, + &ctx->errorfile->f_pos); + set_fs(oldfs); + } + ctx->error_msg[0] = 0; + 
ctx->err_offset = 0; + } +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_dump.c linux-2.6.32.48-openvz/kernel/cpt/cpt_dump.c --- linux-2.6.32.48/kernel/cpt/cpt_dump.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_dump.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,1271 @@ +/* + * + * kernel/cpt/cpt_dump.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_net.h" +#include "cpt_socket.h" +#include "cpt_ubc.h" +#include "cpt_kernel.h" + + +static int vps_child_level(struct task_struct *root, struct task_struct *c) +{ + int level = 0; + int veid = VE_TASK_INFO(c)->owner_env->veid; + + while (VE_TASK_INFO(c)->owner_env->veid == veid) { + if (c->pid != c->tgid) + c = c->group_leader; + if (c == root) + return level; + + c = c->parent; + level++; + } + return -1; +} + +static inline int freezable(struct task_struct * p) +{ + if (p->exit_state) + return 0; + + switch (p->state) { + case EXIT_ZOMBIE: + case EXIT_DEAD: + case TASK_STOPPED: +#if TASK_TRACED != TASK_STOPPED + case TASK_TRACED: +#endif + return 0; + default: + return 1; + } +} + +static void wake_ve(cpt_context_t *ctx) +{ + struct task_struct *p, *g; + + do_each_thread_ve(g, p) { + spin_lock_irq(&p->sighand->siglock); + if (p->flags & PF_FROZEN) { + p->flags &= ~PF_FROZEN; + wake_up_process(p); + } + spin_unlock_irq(&p->sighand->siglock); + } while_each_thread_ve(g, p); +} + +/* + * Some comment is necessary about PF_FREEZE,PF_FROZEN,TIF_FREEZE... 
+ *
+ * SWSUSP uses the PF_FREEZE flag in tsk->flags, raising it in the context
+ * of another process. Apparently, this is unacceptable on SMP.
+ * Let's take freeze_processes() in kernel/power/process.c as an example.
+ * Unserialized modifications of tsk->flags easily
+ * (believe it or not, it happens with a probability of almost 100% :-))
+ * create the situation where setting PF_FREEZE in freeze_processes(),
+ * which quickly spins raising PF_FREEZE of all the processes,
+ * _clears_ PF_FROZEN just set in refrigerator(), so that suspend deadlocks.
+ *
+ * So, to make things clean, we require that those flags may be modified
+ * only under tsk->sighand->siglock, which is quite natural because PF_FREEZE
+ * is just a kind of signal.
+ *
+ * This is not enough, because we are still not allowed to change tsk->flags
+ * in the context of another process: we can corrupt other flags when the
+ * process running on another cpu modifies them. So, we use TIF_FREEZE in
+ * thread flags, which can be changed atomically.
+ *
+ * PF_FROZEN is also changed in the context of another process, but this
+ * happens only when the process is already in refrigerator(), which does
+ * not modify tsk->flags.
+ */ + +static int check_process_external(struct task_struct *p) +{ + if (pid_alive(p)) { + if (p->pids[PIDTYPE_PID].pid->level == 0) + return PIDTYPE_PID; + if (p->pids[PIDTYPE_PGID].pid->level == 0) + return PIDTYPE_PGID; + if (p->pids[PIDTYPE_SID].pid->level == 0) + return PIDTYPE_SID; + } + + return PIDTYPE_MAX; +} + +enum +{ + OBSTACLE_NOGO = -1, + OBSTACLE_TIMEOUT = -2, + OBSTACLE_TRYAGAIN = -3, +}; + +#define SUSPEND_TIMEOUT (10UL*HZ) + +static int vps_stop_tasks(struct cpt_context *ctx) +{ + unsigned long start_time = jiffies; + unsigned long target, timeout; + struct task_struct *p, *g; + int todo; + int round = 0; + + do_gettimespec(&ctx->start_time); + do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time); + ctx->virt_jiffies64 = get_jiffies_64() + get_exec_env()->jiffies_fixup; + + read_lock(&tasklist_lock); + + atomic_inc(&get_exec_env()->suspend); + timeout = HZ/5; + target = jiffies + timeout; + + for(;;) { + struct task_struct *root; + todo = 0; + + root = find_task_by_vpid(1); + if (!root) { + read_unlock(&tasklist_lock); + eprintk_ctx("cannot find ve init\n"); + atomic_dec(&get_exec_env()->suspend); + return -ESRCH; + } + + do_each_thread_ve(g, p) { + if (vps_child_level(root, p) >= 0) { + switch (check_process_external(p)) { + case PIDTYPE_PID: + eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", + task_pid_vnr(p), p->pid, p->comm); + todo = OBSTACLE_NOGO; + goto out; + case PIDTYPE_PGID: + eprintk_ctx("external process group %d/%d(%s) inside CT " + "(e.g. vzctl enter or vzctl exec).\n", + task_pgrp_vnr(p), p->pid, p->comm); + todo = OBSTACLE_NOGO; + goto out; + case PIDTYPE_SID: + eprintk_ctx("external process session %d/%d(%s) inside CT " + "(e.g. vzctl enter or vzctl exec).\n", + task_session_vnr(p), p->pid, p->comm); + todo = OBSTACLE_NOGO; + goto out; + } + if (p->vfork_done) { + /* Task between vfork()...exec() + * cannot be frozen, because parent + * wait in uninterruptible state. 
+ * So, we do nothing, waiting for + * exec(), unless: + */ + if (p->state == TASK_STOPPED || + p->state == TASK_TRACED) { + eprintk_ctx("task " CPT_FID " is stopped while vfork(). " + "Checkpointing is impossible.\n", + CPT_TID(p)); + todo = OBSTACLE_NOGO; + /* It is fatal, _user_ stopped + * vfork()ing task, so that we + * cannot suspend now. + */ + } else { + todo = OBSTACLE_TRYAGAIN; + } + goto out; + } + if (p->signal->group_exit_task && + p->signal->notify_count) { + /* exec() waits for threads' death */ + wprintk_ctx("task " CPT_FID " waits for threads' death\n", CPT_TID(p)); + todo = OBSTACLE_TRYAGAIN; + goto out; + } + if (p->state == TASK_TRACED +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) + && !p->stopped_state +#endif + ) { + int ptrace_id = p->pn_state; + /* Debugger waits for signal. */ + switch (ptrace_id) { + case PN_STOP_TF: + case PN_STOP_TF_RT: + case PN_STOP_ENTRY: + case PN_STOP_FORK: + case PN_STOP_VFORK: + case PN_STOP_SIGNAL: + case PN_STOP_EXIT: + case PN_STOP_LEAVE: + break; + default: + eprintk_ctx("task " CPT_FID " is stopped by debugger while %d.\n", CPT_TID(p), ptrace_id); + todo = OBSTACLE_NOGO; + goto out; + } + } +#ifdef CONFIG_UTRACE + if (check_utrace(p, root, ctx)) { + eprintk_ctx("task " CPT_FID " is utraced. Checkpointing is impossible.\n", CPT_TID(p)); + todo = OBSTACLE_NOGO; + goto out; + } +#endif + if (p->flags & PF_NOFREEZE) { + eprintk_ctx("task " CPT_FID " is unfreezable. 
Checkpointing is impossible.\n", CPT_TID(p)); + todo = OBSTACLE_NOGO; + goto out; + } + + if (!freezable(p)) + continue; + + spin_lock_irq(&p->sighand->siglock); + if (!(p->flags & PF_FROZEN)) { + set_tsk_thread_flag(p, TIF_FREEZE); + signal_wake_up(p, 0); + } + spin_unlock_irq(&p->sighand->siglock); + + if (p->flags & PF_FROZEN) { + if (p->state != TASK_UNINTERRUPTIBLE) + printk("Holy Crap 1 %ld " CPT_FID "\n", p->state, CPT_TID(p)); + continue; + } + + if (round == 10) + wprintk_ctx(CPT_FID " is running\n", CPT_TID(p)); + + todo++; + } else { + if (p != current) { + eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", + task_pid_vnr(p), task_pid_nr(p), p->comm); + todo = OBSTACLE_NOGO; + goto out; + } + } + } while_each_thread_ve(g, p); + + if (todo > 0) { + /* No visible obstacles, but VE did not freeze + * for timeout. Interrupt suspend, if it is major + * timeout or signal; if it is minor timeout + * we will wake VE and restart suspend. + */ + if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) + || signal_pending(current)) + todo = OBSTACLE_TIMEOUT; + else if (time_after(jiffies, target)) + todo = OBSTACLE_TRYAGAIN; + } + +out: + if (todo < 0) { + atomic_dec(&get_exec_env()->suspend); + + wake_ve(ctx); + +#if 0 + /* This is sign of failure of printk(), which is not + * ours. So, no prefixes. */ + printk(">\n"); +#endif + } + + read_unlock(&tasklist_lock); + + if (!todo) { + atomic_dec(&get_exec_env()->suspend); + return 0; + } + + switch (todo) { + case OBSTACLE_NOGO: + eprintk_ctx("suspend is impossible now.\n"); + return -EAGAIN; + + case OBSTACLE_TIMEOUT: + eprintk_ctx("interrupted or timed out.\n"); + return -EINTR; + + case OBSTACLE_TRYAGAIN: + if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) || + signal_pending(current)) { + wprintk_ctx("suspend timed out\n"); + return -EAGAIN; + } + + wprintk_ctx("minor suspend timeout (%lu) expired, " + "trying again\n", timeout); + + /* Try again. 
VE is awake, give it some time to run. */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ); + + /* After a short wait restart suspend + * with longer timeout */ + atomic_inc(&get_exec_env()->suspend); + timeout = min(timeout<<1, SUSPEND_TIMEOUT); + target = jiffies + timeout; + break; + + default: + if (round > 0) { + /* VE is partially frozen, give processes + * a chance to enter to refrigerator(). */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/20); + } else { + yield(); + } + } + + read_lock(&tasklist_lock); + round++; + } +} + +static int cpt_unlock_ve(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + down_write(&env->op_sem); + env->is_locked = 0; + up_write(&env->op_sem); + put_ve(env); + return 0; +} + +int cpt_resume(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx); + + cpt_unlock_sockets(ctx); + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) { + wait_for_completion(&ctx->pgin_notify); + put_task_struct(ctx->pgin_task); + ctx->pgin_task = NULL; + } +#endif + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->flags & PF_FROZEN) { + tsk->flags &= ~PF_FROZEN; + wake_up_process(tsk); + } else if (freezable(tsk)) { + eprintk_ctx("strange, %s not frozen\n", tsk->comm ); + } + spin_unlock_irq(&tsk->sighand->siglock); + put_task_struct(tsk); + } + + cpt_resume_network(ctx); + + cpt_unlock_ve(ctx); + + cpt_finish_ubc(ctx); + cpt_object_destroy(ctx); + return 0; +} + +int cpt_kill(struct cpt_context *ctx) +{ + int err = 0; + struct ve_struct *env; + cpt_object_t *obj; + struct task_struct *root_task = NULL; + long delay; + struct cred *c; + + if (!ctx->ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + + c = prepare_creds(); + if (c == NULL) { + put_ve(env); + return -ENOMEM; 
+ } + + /* from here cpt_kill succeeds */ + virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx); + + if (current->ve_task_info.owner_env == env) { + wprintk_ctx("attempt to kill ve from inside, escaping...\n"); + ve_move_task(current, get_ve0(), c); + } else + abort_creds(c); + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) { + wait_for_completion(&ctx->pgin_notify); + put_task_struct(ctx->pgin_task); + ctx->pgin_task = NULL; + } +#endif + + cpt_kill_sockets(ctx); + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + if (tsk->exit_state) { + put_task_struct(tsk); + continue; + } + + if (task_pid_vnr(tsk) == 1) { + root_task = tsk; + continue; + } + + tsk->robust_list = NULL; +#ifdef CONFIG_COMPAT + tsk->compat_robust_list = NULL; +#endif + tsk->clear_child_tid = NULL; + + if (tsk->ptrace) { + write_lock_irq(&tasklist_lock); + tsk->ptrace = 0; + if (!list_empty(&tsk->ptrace_entry)) { + list_del_init(&tsk->ptrace_entry); + /* + * This code used to be here: + * remove_parent(tsk); + * tsk->parent = tsk->parent; + * add_parent(tsk); + */ + } + write_unlock_irq(&tasklist_lock); + } + + send_sig(SIGKILL, tsk, 1); + + spin_lock_irq(&tsk->sighand->siglock); + sigfillset(&tsk->blocked); + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(tsk, TIF_SIGPENDING); + if (tsk->flags & PF_FROZEN) + tsk->flags &= ~PF_FROZEN; + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + put_task_struct(tsk); + } + + yield(); + + if (root_task != NULL) { + send_sig(SIGKILL, root_task, 1); + + spin_lock_irq(&root_task->sighand->siglock); + sigfillset(&root_task->blocked); + sigdelsetmask(&root_task->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(root_task, TIF_SIGPENDING); + clear_tsk_thread_flag(root_task, TIF_FREEZE); + if (root_task->flags & PF_FROZEN) + root_task->flags &= ~PF_FROZEN; + spin_unlock_irq(&root_task->sighand->siglock); + + wake_up_process(root_task); + put_task_struct(root_task); + } + + 
cpt_finish_ubc(ctx); + cpt_object_destroy(ctx); + + delay = 1; + while (atomic_read(&env->counter) != 1) { + if (signal_pending(current)) + break; + current->state = TASK_INTERRUPTIBLE; + delay = (delay < HZ) ? (delay << 1) : HZ; + schedule_timeout(delay); + } + put_ve(env); + + return err; +} + +#ifdef CONFIG_BEANCOUNTERS +static void collect_task_ubc(struct task_struct *t, struct cpt_context *ctx) +{ + struct task_beancounter *tbc; + + tbc = &(t->task_bc); + cpt_add_ubc(tbc->exec_ub, ctx); + cpt_add_ubc(tbc->task_ub, ctx); + cpt_add_ubc(tbc->fork_sub, ctx); +} +#else +static void inline collect_task_ubc(struct task_struct *t, + struct cpt_context *ctx) +{ return; } +#endif + +static cpt_object_t * remember_task(struct task_struct * child, + cpt_object_t * head, cpt_context_t * ctx) +{ + cpt_object_t *cobj; + + if (freezable(child) && !(child->flags&PF_FROZEN)) { + eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child)); + put_task_struct(child); + return NULL; + } + + if (lookup_cpt_object(CPT_OBJ_TASK, child, ctx)) BUG(); + if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { + put_task_struct(child); + return NULL; + } + cobj->o_count = 1; + cpt_obj_setobj(cobj, child, ctx); + insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx); + collect_task_ubc(child, ctx); + return cobj; +} + +static int vps_collect_tasks(struct cpt_context *ctx) +{ + int err = -ESRCH; + cpt_object_t *obj; + struct task_struct *root; + read_lock(&tasklist_lock); + root = find_task_by_vpid(1); + if (root) + get_task_struct(root); + read_unlock(&tasklist_lock); + + if (!root) { + err = -ESRCH; + eprintk_ctx("vps_collect_tasks: cannot find root\n"); + goto out; + } + + if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { + put_task_struct(root); + return -ENOMEM; + } + obj->o_count = 1; + cpt_obj_setobj(obj, root, ctx); + intern_cpt_object(CPT_OBJ_TASK, obj, ctx); + collect_task_ubc(root, ctx); + + /* Collect process subtree recursively */ + for_each_object(obj, CPT_OBJ_TASK) 
{ + cpt_object_t *head = obj; + struct task_struct *tsk = obj->o_obj; + struct task_struct *child; + + if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) { + eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk)); + err = -EINVAL; + goto out; + } + + if (tsk->state == TASK_RUNNING) + printk("Holy Crap 2 %ld " CPT_FID "\n", tsk->state, CPT_TID(tsk)); + + wait_task_inactive(tsk, 0); + + err = check_task_state(tsk, ctx); + if (err) + goto out; + + if (tsk->pid == tsk->tgid) { + child = tsk; + for (;;) { + read_lock(&tasklist_lock); + child = next_thread(child); + if (child != tsk) + get_task_struct(child); + read_unlock(&tasklist_lock); + + if (child == tsk) + break; + + if (child->parent != tsk->parent) { + put_task_struct(child); + eprintk_ctx("illegal thread structure, kernel bug\n"); + err = -EINVAL; + goto out; + } + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + } + } + + /* About locking. VE is frozen. But lists of children + * may change at least for init, when entered task reparents + * to init and when reparented task exits. If we take care + * of this case, we still can unlock while scanning + * tasklists. 
+		 */
+		read_lock(&tasklist_lock);
+		list_for_each_entry(child, &tsk->children, sibling) {
+			if (child->parent != tsk)
+				continue;
+			if (child->pid != child->tgid)
+				continue;
+			get_task_struct(child);
+			read_unlock(&tasklist_lock);
+
+			if ((head = remember_task(child, head, ctx)) == NULL) {
+				eprintk_ctx("task obj allocation failure\n");
+				err = -ENOMEM;
+				goto out;
+			}
+
+			read_lock(&tasklist_lock);
+		}
+
+		list_for_each_entry(child, &tsk->ptraced, ptrace_entry) {
+			if (child->parent != tsk)
+				continue;
+			if (child->pid != child->tgid)
+				continue;
+			get_task_struct(child);
+			read_unlock(&tasklist_lock);
+
+			if ((head = remember_task(child, head, ctx)) == NULL) {
+				eprintk_ctx("task obj allocation failure\n");
+				err = -ENOMEM;
+				goto out;
+			}
+
+			read_lock(&tasklist_lock);
+		}
+		read_unlock(&tasklist_lock);
+	}
+
+	return 0;
+
+out:
+	while (!list_empty(&ctx->object_array[CPT_OBJ_TASK])) {
+		struct list_head *head = ctx->object_array[CPT_OBJ_TASK].next;
+		cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
+		struct task_struct *tsk;
+
+		list_del(head);
+		tsk = obj->o_obj;
+		put_task_struct(tsk);
+		free_cpt_object(obj, ctx);
+	}
+	return err;
+}
+
+/*
+ * Run the collection passes in order (mm, sysv, files, fs, namespace,
+ * signals) and then the VIRTINFO_SCP_COLLECT notifier chain.  Stops at the
+ * first failing pass and returns its error; returns -ECHRNG when a notifier
+ * reports NOTIFY_FAIL, 0 otherwise.
+ */
+static int cpt_collect(struct cpt_context *ctx)
+{
+	int err;
+
+	err = cpt_collect_mm(ctx);
+	if (err != 0)
+		return err;
+
+	err = cpt_collect_sysv(ctx);
+	if (err != 0)
+		return err;
+
+	err = cpt_collect_files(ctx);
+	if (err != 0)
+		return err;
+
+	err = cpt_collect_fs(ctx);
+	if (err != 0)
+		return err;
+
+	err = cpt_collect_namespace(ctx);
+	if (err != 0)
+		return err;
+
+	err = cpt_collect_signals(ctx);
+	if (err != 0)
+		return err;
+
+	if (virtinfo_notifier_call(VITYPE_SCP,
+				   VIRTINFO_SCP_COLLECT, ctx) & NOTIFY_FAIL)
+		return -ECHRNG;
+
+	return 0;
+}
+
+/*
+ * Dump section CPT_SECT_VEINFO: IPC limits of the VE's ipc namespace, time
+ * elapsed since the VE was started, last allocated pid and the address
+ * space randomization setting.  The record is built in the context's
+ * scratch buffer.  Returns 0.
+ */
+static int cpt_dump_veinfo(cpt_context_t *ctx)
+{
+	struct cpt_veinfo_image *i = cpt_get_buf(ctx);
+	struct ve_struct *ve;
+	struct timespec delta;
+	struct ipc_namespace *ns;
+
+	cpt_open_section(ctx, CPT_SECT_VEINFO);
+	cpt_open_object(NULL, ctx);
+
+	memset(i, 0, sizeof(*i));
+
+	i->cpt_next = CPT_NULL;
+	i->cpt_object = CPT_OBJ_VEINFO;
+	i->cpt_hdrlen = sizeof(*i);
+	i->cpt_content = CPT_CONTENT_VOID;
+
+	ve = get_exec_env();
+	ns = ve->ve_ns->ipc_ns;
+
+	/* Limits above 0xFFFFFFFF are saturated; anything else is stored
+	 * as is.  Without the else branches every limit that fits was
+	 * dumped as 0 (left over from the memset above). */
+	if (ns->shm_ctlall > 0xFFFFFFFFU)
+		i->shm_ctl_all = 0xFFFFFFFFU;
+	else
+		i->shm_ctl_all = ns->shm_ctlall;
+	if (ns->shm_ctlmax > 0xFFFFFFFFU)
+		i->shm_ctl_max = 0xFFFFFFFFU;
+	else
+		i->shm_ctl_max = ns->shm_ctlmax;
+	i->shm_ctl_mni = ns->shm_ctlmni;
+
+	i->msg_ctl_max = ns->msg_ctlmax;
+	i->msg_ctl_mni = ns->msg_ctlmni;
+	i->msg_ctl_mnb = ns->msg_ctlmnb;
+
+	BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr));
+	i->sem_ctl_arr[0] = ns->sem_ctls[0];
+	i->sem_ctl_arr[1] = ns->sem_ctls[1];
+	i->sem_ctl_arr[2] = ns->sem_ctls[2];
+	i->sem_ctl_arr[3] = ns->sem_ctls[3];
+
+	/* Time since VE start, as monotonic time and as jiffies. */
+	do_posix_clock_monotonic_gettime(&delta);
+	_set_normalized_timespec(&delta,
+			delta.tv_sec - ve->start_timespec.tv_sec,
+			delta.tv_nsec - ve->start_timespec.tv_nsec);
+	i->start_timespec_delta = cpt_timespec_export(&delta);
+	i->start_jiffies_delta = get_jiffies_64() - ve->start_jiffies;
+
+	i->last_pid = ve->ve_ns->pid_ns->last_pid;
+	/* Stored with an offset of one. */
+	i->rnd_va_space	= ve->_randomize_va_space + 1;
+
+	ctx->write(i, sizeof(*i), ctx);
+	cpt_release_buf(ctx);
+	cpt_close_object(ctx);
+	cpt_close_section(ctx);
+	return 0;
+}
+
+/*
+ * Dump section CPT_SECT_UTSNAME: two CPT_OBJ_NAME objects holding the
+ * NUL-terminated nodename and domainname of the VE's uts namespace, each
+ * followed by alignment.  Returns 0.
+ */
+static int cpt_dump_utsname(cpt_context_t *ctx)
+{
+	int len;
+	struct cpt_object_hdr o;
+	struct ve_struct *ve;
+	struct uts_namespace *ns;
+
+	cpt_open_section(ctx, CPT_SECT_UTSNAME);
+
+	ve = get_exec_env();
+	ns = ve->ve_ns->uts_ns;
+
+	/* nodename */
+	cpt_open_object(NULL, ctx);
+	len = strlen(ns->name.nodename);
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_NAME;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_NAME;
+
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(ns->name.nodename, len+1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+
+	/* domainname */
+	cpt_open_object(NULL, ctx);
+	len = strlen(ns->name.domainname);
+	o.cpt_next = CPT_NULL;
+	o.cpt_object = CPT_OBJ_NAME;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_NAME;
+
+	ctx->write(&o, sizeof(o), ctx);
+	ctx->write(ns->name.domainname, len+1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+
+	cpt_close_section(ctx);
+	return 0;
+}
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+/*
+ * Dump section CPT_SECT_VSYSCALL: a page block header built in the scratch
+ * buffer, followed by PAGE_SIZE bytes read from vsyscall_addr.  Returns 0.
+ */
+static int cpt_dump_vsyscall(cpt_context_t *ctx)
+{
+	struct cpt_page_block *pgb = cpt_get_buf(ctx);
+
+	cpt_open_section(ctx, CPT_SECT_VSYSCALL);
+	cpt_open_object(NULL, ctx);
+
+	pgb->cpt_next = CPT_NULL;
+	pgb->cpt_object = CPT_OBJ_VSYSCALL;
+	pgb->cpt_hdrlen = sizeof(*pgb);
+	pgb->cpt_content = CPT_CONTENT_DATA;
+	pgb->cpt_start = cpt_ptr_export(vsyscall_addr);
+	pgb->cpt_end = pgb->cpt_start + PAGE_SIZE;
+
+	ctx->write(pgb, sizeof(*pgb), ctx);
+	cpt_release_buf(ctx);
+
+	ctx->write(vsyscall_addr, PAGE_SIZE, ctx);
+
+	cpt_close_object(ctx);
+	cpt_close_section(ctx);
+	return 0;
+}
+#endif
+
+int cpt_dump(struct cpt_context *ctx)
+{
+	struct ve_struct *oldenv, *env;
+	struct nsproxy *old_ns;
+	int err, err2 = 0;
+
+	if (!ctx->ve_id)
+		return -EINVAL;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (!env)
+		return -ESRCH;
+
+	down_read(&env->op_sem);
+	err = -ESRCH;
+	if (!env->is_running)
+		goto out_noenv;
+	if (!env->is_locked)
+		goto out_noenv;
+	err = -EINVAL;
+	if (env->ve_ns->pid_ns->flags & PID_NS_HIDDEN) {
+		printk(KERN_WARNING "CT: checkpointing not supported yet"
+				" for hidden pid namespaces.\n");
+		goto out_noenv;
+	}
+
+	oldenv = set_exec_env(env);
+	old_ns = current->nsproxy;
+	current->nsproxy = env->ve_ns;
+
+	/* Phase 2: real checkpointing */
+	err = cpt_open_dumpfile(ctx);
+	if (err)
+		goto out;
+
+	cpt_major_hdr_out(ctx);
+
+	if (!err)
+		err = cpt_dump_veinfo(ctx);
+	if (!err)
+		err = cpt_dump_ubc(ctx);
+	if (!err)
+		err = cpt_dump_files(ctx);
+	if (!err)
+		err = cpt_dump_files_struct(ctx);
+	if (!err)
+		err = cpt_dump_fs_struct(ctx);
+	/* netdevices should be dumped after dumping open files
+	   as we need to restore netdevice binding to /dev/net/tun file */
+	if (!err)
+		err = cpt_dump_ifinfo(ctx);
+	if (!err)
+		err = cpt_dump_namespace(ctx);
+	if (!err)
+		err =
cpt_dump_sighand(ctx); + if (!err) + err = cpt_dump_vm(ctx); + if (!err) + err = cpt_dump_sysvsem(ctx); + if (!err) + err = cpt_dump_sysvmsg(ctx); + if (!err) + err = cpt_dump_tasks(ctx); + if (!err) + err = cpt_dump_orphaned_sockets(ctx); +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + if (!err) + err = cpt_dump_ip_conntrack(ctx); +#endif + if (!err) { + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_DUMP, ctx) & NOTIFY_FAIL) + err = -ECHRNG; + } + if (!err) + err = cpt_dump_utsname(ctx); + +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) + if (!err) + err = cpt_dump_vsyscall(ctx); +#endif + + if (!err) + err = cpt_dump_tail(ctx); + + err2 = cpt_close_dumpfile(ctx); + +out: + current->nsproxy = old_ns; + set_exec_env(oldenv); +out_noenv: + up_read(&env->op_sem); + put_ve(env); + return err ? : err2; +} + +int cpt_vps_suspend(struct cpt_context *ctx) +{ + struct ve_struct *oldenv, *env; + struct nsproxy *old_ns; + int err = 0; + + ctx->kernel_config_flags = test_kernel_config(); + cpt_object_init(ctx); + + if (!ctx->ve_id) { + env = get_exec_env(); + if (env == get_ve0()) + return -EINVAL; + wprintk("undefined ve_id\n"); + ctx->ve_id = env->veid; + get_ve(env); + } else { + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + } + +#ifdef CONFIG_VE_IPTABLES + ctx->iptables_mask = env->_iptables_modules; +#endif + ctx->features = env->features; + + down_write(&env->op_sem); + err = -ESRCH; + if (!env->is_running) + goto out_noenv; + + err = -EBUSY; + if (env->is_locked) + goto out_noenv; + env->is_locked = 1; + downgrade_write(&env->op_sem); + + oldenv = set_exec_env(env); + old_ns = current->nsproxy; + current->nsproxy = env->ve_ns; + + /* Phase 0: find and stop all the tasks */ + if ((err = vps_stop_tasks(ctx)) != 0) + goto out; + + if ((err = cpt_suspend_network(ctx)) != 0) + goto out_wake; + + /* At the moment all the state is frozen. 
We do not need to lock + * the state, which can be changed only if the tasks are running. + */ + + /* Phase 1: collect task tree */ + if ((err = vps_collect_tasks(ctx)) != 0) + goto out_wake; + + /* Phase 1': collect all the resources */ + if ((err = cpt_collect(ctx)) != 0) + goto out; + +out: + current->nsproxy = old_ns; + set_exec_env(oldenv); + up_read(&env->op_sem); + put_ve(env); + return err; + +out_noenv: + up_write(&env->op_sem); + put_ve(env); + return err; + +out_wake: + read_lock(&tasklist_lock); + wake_ve(ctx); + read_unlock(&tasklist_lock); + goto out; +} + +static void check_unsupported_netdevices(struct cpt_context *ctx, __u32 *caps) +{ + struct net *net = get_exec_env()->ve_netns; + struct net_device *dev; + + read_lock(&dev_base_lock); + for_each_netdev(net, dev) { + if (dev->netdev_ops->ndo_cpt == NULL) { + eprintk_ctx("unsupported netdevice %s\n", dev->name); + *caps |= (1<flags & _TIF_IA32)) + *caps |= flags & ((1<mm && p->mm->context.vdso) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + *caps |= flags & (1<mm && p->mm->context.vdso) + *caps |= flags & (1<= 0) { + switch (check_process_external(p)) { + case PIDTYPE_PID: + eprintk_ctx("external process %d/%d(%s) inside CT (e.g. 
vzctl enter or vzctl exec).\n", task_pid_vnr(p), p->pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<nsproxy) { + ns = p->nsproxy->mnt_ns; + if (ns) + get_mnt_ns(ns); + } + task_unlock(p); + if (ns) { + if (ns != current->nsproxy->mnt_ns) { + eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm); + *caps |= (1<policy != SCHED_NORMAL) { + eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm); + *caps |= (1<pid, virt_pid(p), p->comm); + *caps |= (1<list) { + struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); + struct path p; + + p.dentry = mnt->mnt_root; + p.mnt = mnt; + spin_lock(&dcache_lock); + path = __d_path(&p, &env->root_path, + path_buf, PAGE_SIZE); + spin_unlock(&dcache_lock); + if (IS_ERR(path)) + continue; + + if (check_one_vfsmount(mnt)) { + eprintk_ctx("Unsupported filesystem %s\n", mnt->mnt_sb->s_type->name); + *caps |= (1<ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (env == NULL) + return -ESRCH; + + down_read(&env->op_sem); + err = -ESRCH; + if (!env->is_running) { + eprintk_ctx("CT is not running\n"); + goto out_noenv; + } + + err = -EBUSY; + if (env->is_locked) { + eprintk_ctx("CT is locked\n"); + goto out_noenv; + } + + *caps = flags & (1<nsproxy; + current->nsproxy = env->ve_ns; + + check_unsupported_netdevices(ctx, caps); + + read_lock(&tasklist_lock); + root = find_task_by_vpid(1); + if (!root) { + read_unlock(&tasklist_lock); + eprintk_ctx("cannot find ve init\n"); + err = -ESRCH; + goto out; + } + get_task_struct(root); + for (p = __first_task_ve(env); p != NULL ; p = __next_task_ve(env, p)) + check_one_process(ctx, caps, flags, env, root, p); + read_unlock(&tasklist_lock); + + task_lock(root); + n = NULL; + if (root->nsproxy) { + n = root->nsproxy->mnt_ns; + if (n) + get_mnt_ns(n); + } + task_unlock(root); + if (n) { + char *path_buf; + + path_buf = (char *) 
__get_free_page(GFP_KERNEL); + if (!path_buf) { + put_mnt_ns(n); + err = -ENOMEM; + goto out_root; + } + + check_unsupported_mounts(ctx, caps, env, n, path_buf); + + free_page((unsigned long) path_buf); + put_mnt_ns(n); + } + + err = 0; + +out_root: + put_task_struct(root); +out: + current->nsproxy = old_ns; + set_exec_env(old_env); +out_noenv: + up_read(&env->op_sem); + put_ve(env); + + return err; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_dump.h linux-2.6.32.48-openvz/kernel/cpt/cpt_dump.h --- linux-2.6.32.48/kernel/cpt/cpt_dump.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_dump.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,16 @@ +int cpt_dump(struct cpt_context *cpt); +int rst_undump(struct cpt_context *cpt); +int cpt_suspend(struct cpt_context *cpt); +int cpt_resume(struct cpt_context *cpt); +int cpt_kill(struct cpt_context *cpt); +int rst_clean(struct cpt_context *cpt); +int rst_resume(struct cpt_context *cpt); +int rst_kill(struct cpt_context *cpt); + +int cpt_freeze_one(pid_t pid, int freeze); +int cpt_vps_suspend(struct cpt_context *ctx); +int vps_rst_undump(struct cpt_context *ctx); + +int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps); + +int cpt_check_unsupported(struct task_struct *tsk, struct cpt_context *ctx); diff -urNp linux-2.6.32.48/kernel/cpt/cpt_epoll.c linux-2.6.32.48-openvz/kernel/cpt/cpt_epoll.c --- linux-2.6.32.48/kernel/cpt/cpt_epoll.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_epoll.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,113 @@ +/* + * + * kernel/cpt/cpt_epoll.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx) /* Dump the epoll file held in obj->o_obj: writes one CPT_OBJ_EPOLL header that refers to obj->o_pos, then one CPT_OBJ_EPOLL_FILE record per item of ep->rbr, walked with rb_first()/rb_next(). Returns 0 on success, -EINVAL if the file does not use eventpoll_fops or has a NULL private_data, and -EBUSY if an item refers to a file for which ctx holds no CPT_OBJ_FILE object. */ +{ + int err = 0; + struct file *file = obj->o_obj; + struct eventpoll *ep; + struct rb_node *rbp; + struct cpt_epoll_image ei; + + if (file->f_op != &eventpoll_fops) { + eprintk_ctx("bad epoll file\n"); + return -EINVAL; + } + + ep = file->private_data; + + /* eventpoll.c does not protect open /proc/N/fd, silly. + * Opener will get an invalid file with uninitialized private_data + */ + if (unlikely(ep == NULL)) { + eprintk_ctx("bad epoll device\n"); + return -EINVAL; + } + + cpt_open_object(NULL, ctx); + + ei.cpt_next = CPT_NULL; + ei.cpt_object = CPT_OBJ_EPOLL; + ei.cpt_hdrlen = sizeof(ei); + ei.cpt_content = CPT_CONTENT_ARRAY; + ei.cpt_file = obj->o_pos; + + ctx->write(&ei, sizeof(ei), ctx); + + mutex_lock(&epmutex); /* epmutex stays held for the whole walk of ep->rbr below */ + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + loff_t saved_obj; + cpt_object_t *tobj; + struct cpt_epoll_file_image efi; + struct epitem *epi; + epi = rb_entry(rbp, struct epitem, rbn); + tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx); + if (tobj == NULL) { /* no CPT_OBJ_FILE object for the watched file: stop the walk; the unlock and the outer cpt_close_object() after the loop still run */ + eprintk_ctx("epoll device refers to an external file\n"); + err = -EBUSY; + break; + } + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + efi.cpt_next = CPT_NULL; + efi.cpt_object = CPT_OBJ_EPOLL_FILE; + efi.cpt_hdrlen = sizeof(efi); + efi.cpt_content = CPT_CONTENT_VOID; + efi.cpt_file = tobj->o_pos; + efi.cpt_fd = epi->ffd.fd; + efi.cpt_events = epi->event.events; + efi.cpt_data = epi->event.data; + efi.cpt_revents = 0; + efi.cpt_ready = 0; /* becomes 1 below only when epi->rdllink is not an empty list */ + if (!list_empty(&epi->rdllink)) + efi.cpt_ready = 1; + + 
ctx->write(&efi, sizeof(efi), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + mutex_unlock(&epmutex); + + cpt_close_object(ctx); /* pairs with the cpt_open_object() call made before the CPT_OBJ_EPOLL header was written */ + + return err; +} + diff -urNp linux-2.6.32.48/kernel/cpt/cpt_exports.c linux-2.6.32.48-openvz/kernel/cpt/cpt_exports.c --- linux-2.6.32.48/kernel/cpt/cpt_exports.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_exports.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,13 @@ +#include +#include + +#include "cpt_obj.h" + +EXPORT_SYMBOL(alloc_cpt_object); +EXPORT_SYMBOL(intern_cpt_object); +EXPORT_SYMBOL(insert_cpt_object); +EXPORT_SYMBOL(__cpt_object_add); +EXPORT_SYMBOL(cpt_object_add); +EXPORT_SYMBOL(cpt_object_get); +EXPORT_SYMBOL(lookup_cpt_object); +EXPORT_SYMBOL(lookup_cpt_obj_bypos); diff -urNp linux-2.6.32.48/kernel/cpt/cpt_files.c linux-2.6.32.48-openvz/kernel/cpt/cpt_files.c --- linux-2.6.32.48/kernel/cpt/cpt_files.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_files.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,1782 @@ +/* + * + * kernel/cpt/cpt_files.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +static inline int is_signalfd_file(struct file *file) +{ + /* no other users of it yet */ + return file->f_op == &signalfd_fops; +} + +void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt) +{ + char *path; + struct path p; + unsigned long pg = __get_free_page(GFP_KERNEL); + + if (!pg) + return; + + p.dentry = d; + p.mnt = mnt; + path = d_path(&p, (char *)pg, PAGE_SIZE); + + if (!IS_ERR(path)) + eprintk("<%s>", path); + free_page(pg); +} + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, + int verify, cpt_context_t *ctx) +{ + if (d->d_inode->i_sb->s_magic == FSMAGIC_PROC && + proc_dentry_of_dead_task(d)) + return 0; + + if (path[0] == '/' && !(!IS_ROOT(d) && d_unhashed(d))) { + struct nameidata nd; + if (path_lookup(path, 0, &nd)) { + eprintk_ctx("d_path cannot be looked up %s\n", path); + return -EINVAL; + } + if (nd.path.dentry != d || (verify && nd.path.mnt != mnt)) { + if (!strcmp(path, "/dev/null")) { + /* + * epic kludge to workaround the case, when the + * init opens a /dev/null and then udevd + * overmounts the /dev with tmpfs + */ + path_put(&nd.path); + return 0; + } + + eprintk_ctx("d_path is invisible %s\n", path); + path_put(&nd.path); + return -EINVAL; + } + path_put(&nd.path); + } + return 0; +} + +static int +cpt_replaced(struct dentry * de, struct vfsmount *mnt, cpt_context_t * ctx) +{ + int result = 0; + +#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE) + char *path; + unsigned long pg; + struct 
dentry * renamed_dentry; + struct path p; + + if (de->d_sb->s_magic != FSMAGIC_VEFS) + return 0; + if (de->d_inode->i_nlink != 0 || + atomic_read(&de->d_inode->i_writecount) > 0) + return 0; + + renamed_dentry = vefs_replaced_dentry(de); + if (renamed_dentry == NULL) + return 0; + + pg = __get_free_page(GFP_KERNEL); + if (!pg) + return 0; + + p.dentry = de; + p.mnt = mnt; + path = d_path(&p, (char *)pg, PAGE_SIZE); + if (!IS_ERR(path)) { + int len; + struct nameidata nd; + + len = pg + PAGE_SIZE - 1 - (unsigned long)path; + if (len >= sizeof("(deleted) ") - 1 && + !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { + len -= sizeof("(deleted) ") - 1; + path += sizeof("(deleted) ") - 1; + } + + if (path_lookup(path, 0, &nd) == 0) { + if (mnt == nd.path.mnt && + vefs_is_renamed_dentry(nd.path.dentry, renamed_dentry)) + result = 1; + path_put(&nd.path); + } + } + free_page(pg); +#endif + return result; +} + +static int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt, + int replaced, int verify, cpt_context_t *ctx) +{ + int len; + char *path; + struct path p; + char *pg = cpt_get_buf(ctx); + loff_t saved; + + p.dentry = d; + p.mnt = mnt; + path = d_path(&p, pg, PAGE_SIZE); + len = PTR_ERR(path); + + if (IS_ERR(path)) { + struct cpt_object_hdr o; + char tmp[1]; + + /* VZ changes d_path() to return EINVAL, when path + * is not supposed to be visible inside VE. + * This changes behaviour of d_path() comparing + * to mainstream kernel, f.e. d_path() fails + * on any kind of shared memory. Maybe, there are + * another cases, but I am aware only about this one. + * So, we just ignore error on shmem mounts and proceed. + * Otherwise, checkpointing is prohibited because + * of reference to an invisible file. 
+ */ + if (len != -EINVAL || + mnt != get_exec_env()->shmem_mnt) + eprintk_ctx("d_path err=%d\n", len); + else + len = 0; + + cpt_push_object(&saved, ctx); + cpt_open_object(NULL, ctx); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + tmp[0] = 0; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(tmp, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved, ctx); + + __cpt_release_buf(ctx); + return len; + } else { + struct cpt_object_hdr o; + + len = pg + PAGE_SIZE - 1 - path; + if (replaced && + len >= sizeof("(deleted) ") - 1 && + !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { + len -= sizeof("(deleted) ") - 1; + path += sizeof("(deleted) ") - 1; + } + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + path[len] = 0; + + if (cpt_verify_overmount(path, d, mnt, verify, ctx)) { + __cpt_release_buf(ctx); + return -EINVAL; + } + + cpt_push_object(&saved, ctx); + cpt_open_object(NULL, ctx); + ctx->write(&o, sizeof(o), ctx); + ctx->write(path, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved, ctx); + __cpt_release_buf(ctx); + } + return 0; +} + +int cpt_dump_string(const char *s, struct cpt_context *ctx) +{ + int len; + struct cpt_object_hdr o; + + cpt_open_object(NULL, ctx); + len = strlen(s); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(s, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} + +static int +cpt_dump_filename(struct file *file, int replaced, cpt_context_t *ctx) +{ + return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, replaced, 1, ctx); +} + +int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err; + struct cpt_inode_image *v = cpt_get_buf(ctx); + struct kstat sbuf; + + 
v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_INODE; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) { + cpt_release_buf(ctx); + return err; + } + + v->cpt_dev = d->d_inode->i_sb->s_dev; + v->cpt_ino = d->d_inode->i_ino; + v->cpt_mode = sbuf.mode; + v->cpt_nlink = sbuf.nlink; + v->cpt_uid = sbuf.uid; + v->cpt_gid = sbuf.gid; + v->cpt_rdev = d->d_inode->i_rdev; + v->cpt_size = sbuf.size; + v->cpt_atime = cpt_timespec_export(&sbuf.atime); + v->cpt_mtime = cpt_timespec_export(&sbuf.mtime); + v->cpt_ctime = cpt_timespec_export(&sbuf.ctime); + v->cpt_blksize = sbuf.blksize; + v->cpt_blocks = sbuf.blocks; + v->cpt_sb = d->d_inode->i_sb->s_magic; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + +int cpt_collect_files(cpt_context_t * ctx) +{ + int err; + cpt_object_t *obj; + int index = 0; + + /* Collect process fd sets */ + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL) + return -ENOMEM; + } + + /* Collect files from fd sets */ + for_each_object(obj, CPT_OBJ_FILES) { + int fd; + struct files_struct *f = obj->o_obj; + + cpt_obj_setindex(obj, index++, ctx); + + if (obj->o_count != atomic_read(&f->count)) { + eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count)); + return -EBUSY; + } + + for (fd = 0; fd < f->fdt->max_fds; fd++) { + struct file *file = fcheck_files(f, fd); + if (file && cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL) + return -ENOMEM; + } + } + + /* Collect files queued by AF_UNIX sockets. */ + if ((err = cpt_collect_passedfds(ctx)) < 0) + return err; + + /* OK. At this point we should count all the references. 
*/ + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + struct file *parent; + cpt_object_t *ino_obj; + + if (obj->o_count != atomic_long_read(&file->f_count)) { + eprintk_ctx("file struct is referenced outside %d %ld\n", obj->o_count, atomic_long_read(&file->f_count)); + cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); + return -EBUSY; + } + + switch (file->f_dentry->d_inode->i_sb->s_magic) { + case FSMAGIC_FUTEX: + case FSMAGIC_MQUEUE: + case FSMAGIC_BDEV: +#ifndef CONFIG_INOTIFY_USER + case FSMAGIC_INOTIFY: +#endif + eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic); + return -EBUSY; + } + + /* Collect inode. It is necessary mostly to resolve deleted + * hard links. */ + ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (ino_obj == NULL) + return -ENOMEM; + + parent = ino_obj->o_parent; + if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) + ino_obj->o_parent = file; + + if (S_ISCHR(file->f_dentry->d_inode->i_mode)) { + int maj = imajor(file->f_dentry->d_inode); + if (maj == PTY_MASTER_MAJOR || + (maj >= UNIX98_PTY_MASTER_MAJOR && + maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || + maj == PTY_SLAVE_MAJOR || + maj == UNIX98_PTY_SLAVE_MAJOR || + maj == TTYAUX_MAJOR) { + err = cpt_collect_tty(file, ctx); + if (err) + return err; + } + } + + if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { + err = cpt_collect_socket(file, ctx); + if (err) + return err; + } + } + + err = cpt_index_sockets(ctx); + + return err; +} + +/* /dev/ptmx is special, all the files share one inode, but real tty backend + * is attached via file->private_data. 
+ */ + +static inline int is_cloning_inode(struct inode *ino) +{ + return S_ISCHR(ino->i_mode) && + ino->i_rdev == MKDEV(TTYAUX_MAJOR,2); +} + +static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx) +{ + pid_t pid; + struct cpt_flock_image *v = cpt_get_buf(ctx); + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_FLOCK; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_owner = owner; + + pid = fl->fl_pid; + if (pid) { + pid = pid_to_vpid(fl->fl_pid); + if (pid == -1) { + if (!(fl->fl_flags&FL_FLOCK)) { + eprintk_ctx("posix lock from another container?\n"); + cpt_release_buf(ctx); + return -EBUSY; + } + pid = 0; + } + } + + v->cpt_pid = pid; + v->cpt_start = fl->fl_start; + v->cpt_end = fl->fl_end; + v->cpt_flags = fl->fl_flags; + v->cpt_type = fl->fl_type; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + + +int cpt_dump_flock(struct file *file, struct cpt_context *ctx) +{ + int err = 0; + struct file_lock *fl; + + lock_kernel(); + for (fl = file->f_dentry->d_inode->i_flock; + fl; fl = fl->fl_next) { + if (file != fl->fl_file) + continue; + if (fl->fl_flags & FL_LEASE) { + eprintk_ctx("lease lock is not supported\n"); + err = -EINVAL; + break; + } + if (fl->fl_flags & FL_POSIX) { + cpt_object_t *obj; + obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx); + if (obj) { + dump_one_flock(fl, obj->o_index, ctx); + continue; + } else { + eprintk_ctx("unknown lock owner %p\n", fl->fl_owner); + err = -EINVAL; + } + } + if (fl->fl_flags & FL_FLOCK) { + dump_one_flock(fl, -1, ctx); + continue; + } + } + unlock_kernel(); + return err; +} + +static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx) +{ + int err = 0; + cpt_object_t *iobj; + struct cpt_file_image *v = cpt_get_buf(ctx); + struct kstat sbuf; + int replaced = 0; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILE; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content 
= CPT_CONTENT_ARRAY; + + v->cpt_flags = file->f_flags; + v->cpt_mode = file->f_mode; + v->cpt_pos = file->f_pos; + v->cpt_uid = file->f_cred->uid; + v->cpt_gid = file->f_cred->gid; + + vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf); + + v->cpt_i_mode = sbuf.mode; + v->cpt_lflags = 0; + + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) { + v->cpt_lflags |= CPT_DENTRY_PROC; + if (proc_dentry_of_dead_task(file->f_dentry)) + v->cpt_lflags |= CPT_DENTRY_PROCPID_DEAD; + } + + if (IS_ROOT(file->f_dentry)) + v->cpt_lflags |= CPT_DENTRY_ROOT; + else if (d_unhashed(file->f_dentry)) { + if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) { + v->cpt_lflags |= CPT_DENTRY_REPLACED; + replaced = 1; + } else if (!(v->cpt_lflags & CPT_DENTRY_PROCPID_DEAD)) + v->cpt_lflags |= CPT_DENTRY_DELETED; + } + if (is_cloning_inode(file->f_dentry->d_inode)) + v->cpt_lflags |= CPT_DENTRY_CLONING; + + v->cpt_inode = CPT_NULL; + if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) { + iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (iobj) { + v->cpt_inode = iobj->o_pos; + if (iobj->o_flags & CPT_INODE_HARDLINKED) + v->cpt_lflags |= CPT_DENTRY_HARDLINKED; + } + } + v->cpt_priv = CPT_NULL; + v->cpt_fown_fd = -1; + if (S_ISCHR(v->cpt_i_mode)) { + iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx); + if (iobj) { + v->cpt_priv = iobj->o_pos; + if (file->f_flags&FASYNC) + v->cpt_fown_fd = cpt_tty_fasync(file, ctx); + } + if (imajor(file->f_dentry->d_inode) == MISC_MAJOR && + iminor(file->f_dentry->d_inode) == TUN_MINOR) + v->cpt_lflags |= CPT_DENTRY_TUNTAP; + } + if (S_ISSOCK(v->cpt_i_mode)) { + if (obj->o_index < 0) { + eprintk_ctx("BUG: no socket index\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_priv = obj->o_index; + if (file->f_flags&FASYNC) + v->cpt_fown_fd = cpt_socket_fasync(file, ctx); + } + if (file->f_op == &eventpoll_fops) { + v->cpt_priv = file->f_dentry->d_inode->i_ino; + v->cpt_lflags |= CPT_DENTRY_EPOLL; + } + if 
(file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) { + v->cpt_priv = file->f_dentry->d_inode->i_ino; + v->cpt_lflags |= CPT_DENTRY_INOTIFY; + } + + v->cpt_fown_pid = (file->f_owner.pid == NULL ? + CPT_FOWN_STRAY_PID : pid_vnr(file->f_owner.pid)); + v->cpt_fown_uid = file->f_owner.uid; + v->cpt_fown_euid = file->f_owner.euid; + v->cpt_fown_signo = file->f_owner.signum; + + if (is_signalfd_file(file)) { + struct signalfd_ctx *ctx = file->private_data; + v->cpt_lflags |= CPT_DENTRY_SIGNALFD; + v->cpt_priv = cpt_sigset_export(&ctx->sigmask); + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (!S_ISSOCK(v->cpt_i_mode)) { + err = cpt_dump_filename(file, replaced, ctx); + if (err) + return err; + if ((file->f_mode & FMODE_WRITE) && + file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_VEFS) + vefs_track_notify(file->f_dentry, 1); + } + + if (file->f_dentry->d_inode->i_flock) + err = cpt_dump_flock(file, ctx); + + cpt_close_object(ctx); + + return err; +} + +/* About this weird function... Crappy code dealing with SYSV shared memory + * defines TMPFS inode and file with f_op doing only mmap. So... + * Maybe, this is wrong and leaks something. It is clear access to + * SYSV shmem via mmap is quite unusual and impossible from user space. 
+ */ +static int dump_content_shm(struct file *file, struct cpt_context *ctx) +{ + struct cpt_obj_bits *v; + loff_t saved_pos; + unsigned long addr; + + addr = do_mmap_pgoff(file, 0, file->f_dentry->d_inode->i_size, + PROT_READ, MAP_SHARED, 0); + if (IS_ERR((void*)addr)) + return PTR_ERR((void*)addr); + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v = cpt_get_buf(ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = file->f_dentry->d_inode->i_size; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + ctx->write((void*)addr, file->f_dentry->d_inode->i_size, ctx); + ctx->align(ctx); + do_munmap(current->mm, addr, file->f_dentry->d_inode->i_size); + + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + return 0; +} + +static int data_is_zero(char *addr, int len) +{ + int i; + unsigned long zerolong = 0; + + for (i=0; if_op == NULL) + return -EINVAL; + + do_read = file->f_op->read; + + if (file->f_op == &shm_file_operations || + file->f_op == &shmem_file_operations) { + + /* shmget uses shm ops */ + if (file->f_op == &shm_file_operations) { + struct shm_file_data *sfd = file->private_data; + file = sfd->file; + } + + cpt_dump_content_sysvshm(file, ctx); + + do_read = file->f_dentry->d_inode->i_fop->read; + if (!do_read) { + wprintk_ctx("TMPFS is not configured?\n"); + return dump_content_shm(file, ctx); + } + } + + if (!(file->f_mode & FMODE_READ) || + (file->f_flags & O_DIRECT)) { + struct file *filp; + filp = dentry_open(dget(file->f_dentry), + mntget(file->f_vfsmnt), + O_RDONLY | O_LARGEFILE, + NULL /* not checked */); + if (IS_ERR(filp)) { + cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); + eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(filp)); + return PTR_ERR(filp); + } + file = filp; + } else { + atomic_long_inc(&file->f_count); + } + + for (;;) { + mm_segment_t oldfs; + int err; + + (void)cpt_get_buf(ctx); + + oldfs = 
get_fs(); set_fs(KERNEL_DS); + err = do_read(file, ctx->tmpbuf, PAGE_SIZE, &pos); + set_fs(oldfs); + if (err < 0) { + eprintk_ctx("dump_content_regular: do_read: %d", err); + fput(file); + __cpt_release_buf(ctx); + return err; + } + if (err == 0) { + __cpt_release_buf(ctx); + break; + } + if (data_is_zero(ctx->tmpbuf, err)) { + if (obj_opened != CPT_NULL) { + ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + obj_opened = CPT_NULL; + } + } else { + if (obj_opened == CPT_NULL) { + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + obj_opened = ctx->file->f_pos; + pgb.cpt_next = CPT_NULL; + pgb.cpt_object = CPT_OBJ_PAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_DATA; + pgb.cpt_start = pos - err; + pgb.cpt_end = pgb.cpt_start; + ctx->write(&pgb, sizeof(pgb), ctx); + } + ctx->write(ctx->tmpbuf, err, ctx); + pgb.cpt_end += err; + } + __cpt_release_buf(ctx); + } + + fput(file); + + if (obj_opened != CPT_NULL) { + ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + obj_opened = CPT_NULL; + } + return 0; +} + + +static int dump_content_chrdev(struct file *file, struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + int maj; + + maj = imajor(ino); + if (maj == MEM_MAJOR) { + /* Well, OK. 
*/ + return 0; + } + if (maj == PTY_MASTER_MAJOR || + (maj >= UNIX98_PTY_MASTER_MAJOR && + maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || + maj == PTY_SLAVE_MAJOR || + maj == UNIX98_PTY_SLAVE_MAJOR || + maj == TTYAUX_MAJOR) { + return cpt_dump_content_tty(file, ctx); + } + if (maj == MISC_MAJOR && iminor(ino) == TUN_MINOR) + return 0; + + eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino)); + return -EINVAL; +} + +static int dump_content_blkdev(struct file *file, struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + + /* We are not going to transfer them. */ + eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino)); + return -EINVAL; +} + +static int dump_content_fifo(struct file *file, struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + cpt_object_t *obj; + loff_t saved_pos; + int readers; + int writers; + int anon = 0; + + mutex_lock(&ino->i_mutex); + readers = ino->i_pipe->readers; + writers = ino->i_pipe->writers; + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file1 = obj->o_obj; + if (file1->f_dentry->d_inode == ino) { + if (file1->f_mode & FMODE_READ) + readers--; + if (file1->f_mode & FMODE_WRITE) + writers--; + } + } + mutex_unlock(&ino->i_mutex); + if (readers || writers) { + struct dentry *dr = file->f_dentry->d_sb->s_root; + if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0) + anon = 1; + + if (anon) { + eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers); + return -EBUSY; + } + /* If fifo has external readers/writers, we are in troubles. + * If the buffer is not empty, we must move its content. + * But if the fifo is owned by a service, we cannot do + * this. See? + * + * For now we assume, that if fifo is opened by another + * process, we do not own it and, hence, migrate without + * data. + */ + return 0; + } + + /* OK, we must save fifo state. No semaphores required. 
*/ + + if (ino->i_pipe->nrbufs) { + struct cpt_obj_bits *v = cpt_get_buf(ctx); + struct pipe_inode_info *info; + int count, buf, nrbufs; + + mutex_lock(&ino->i_mutex); + info = ino->i_pipe; + count = 0; + buf = info->curbuf; + nrbufs = info->nrbufs; + while (--nrbufs >= 0) { + if (!info->bufs[buf].ops->can_merge) { + mutex_unlock(&ino->i_mutex); + eprintk_ctx("unknown format of pipe buffer\n"); + return -EINVAL; + } + count += info->bufs[buf].len; + buf = (buf+1) & (PIPE_BUFFERS-1); + } + + if (!count) { + mutex_unlock(&ino->i_mutex); + return 0; + } + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = count; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + count = 0; + buf = info->curbuf; + nrbufs = info->nrbufs; + while (--nrbufs >= 0) { + struct pipe_buffer *b = info->bufs + buf; + /* need to ->pin first? */ + void * addr = b->ops->map(info, b, 0); + ctx->write(addr + b->offset, b->len, ctx); + b->ops->unmap(info, b, addr); + buf = (buf+1) & (PIPE_BUFFERS-1); + } + + mutex_unlock(&ino->i_mutex); + + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + } + + return 0; +} + +static int dump_content_socket(struct file *file, struct cpt_context *ctx) +{ + return 0; +} + +struct cpt_dirent { + unsigned long ino; + char *name; + int namelen; + int found; +}; + +static int cpt_filldir(void * __buf, const char * name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct cpt_dirent * dirent = __buf; + + if ((ino == dirent->ino) && (namelen < PAGE_SIZE - 1)) { + memcpy(dirent->name, name, namelen); + dirent->name[namelen] = '\0'; + dirent->namelen = namelen; + dirent->found = 1; + return 1; + } + return 0; +} + +static int find_linked_dentry(struct dentry *d, struct vfsmount *mnt, + struct inode *ino, struct cpt_context *ctx) +{ + int err = -EBUSY; + struct file 
*f = NULL; + struct cpt_dirent entry; + struct dentry *de, *found = NULL; + + dprintk_ctx("deleted reference to existing inode, try to find file\n"); + /* 1. Try to find not deleted dentry in ino->i_dentry list */ + spin_lock(&dcache_lock); + list_for_each_entry(de, &ino->i_dentry, d_alias) { + if (!IS_ROOT(de) && d_unhashed(de)) + continue; + found = de; + dget_locked(found); + break; + } + spin_unlock(&dcache_lock); + if (found) { + err = cpt_dump_dentry(found, mnt, 0, 1, ctx); + dput(found); + if (!err) { + dprintk_ctx("dentry found in aliases\n"); + return 0; + } + } + + /* 2. Try to find file in current dir */ + de = dget_parent(d); + if (!de) + return -EINVAL; + + mntget(mnt); + f = dentry_open(de, mnt, O_RDONLY | O_LARGEFILE, NULL); + if (IS_ERR(f)) + return PTR_ERR(f); + + entry.ino = ino->i_ino; + entry.name = cpt_get_buf(ctx); + entry.found = 0; + err = vfs_readdir(f, cpt_filldir, &entry); + if (err || !entry.found) { + err = err ? err : -ENOENT; + goto err_readdir; + } + + found = lookup_one_len(entry.name, de, entry.namelen); + if (IS_ERR(found)) { + err = PTR_ERR(found); + goto err_readdir; + } + + err = -ENOENT; + if (found->d_inode != ino) + goto err_lookup; + + dprintk_ctx("dentry found in dir\n"); + __cpt_release_buf(ctx); + err = cpt_dump_dentry(found, mnt, 0, 1, ctx); + +err_lookup: + dput(found); +err_readdir: + fput(f); + __cpt_release_buf(ctx); + return err; +} + +static struct dentry *find_linkdir(struct vfsmount *mnt, struct cpt_context *ctx) +{ + int i; + + for (i = 0; i < ctx->linkdirs_num; i++) + if (ctx->linkdirs[i]->f_vfsmnt == mnt) + return ctx->linkdirs[i]->f_dentry; + return NULL; +} + +struct dentry *cpt_fake_link(struct dentry *d, struct vfsmount *mnt, + struct inode *ino, struct cpt_context *ctx) +{ + int err; + int order = 8; + const char *prefix = ".cpt_hardlink."; + int preflen = strlen(prefix) + order; + char name[preflen + 1]; + struct dentry *dirde, *hardde; + + dirde = find_linkdir(mnt, ctx); + if (!dirde) { + err = 
-ENOENT; + goto out; + } + + ctx->linkcnt++; + snprintf(name, sizeof(name), "%s%0*u", prefix, order, ctx->linkcnt); + + mutex_lock(&dirde->d_inode->i_mutex); + hardde = lookup_one_len(name, dirde, strlen(name)); + if (IS_ERR(hardde)) { + err = PTR_ERR(hardde); + goto out_unlock; + } + + if (hardde->d_inode) { + /* Userspace should clean hardlinked files from previous + * dump/undump + */ + eprintk_ctx("Hardlinked file already exists: %s\n", name); + err = -EEXIST; + goto out_put; + } + + if (d == NULL) + err = vfs_create(dirde->d_inode, hardde, 0600, NULL); + else + err = vfs_link(d, dirde->d_inode, hardde); + if (err) { + eprintk_ctx("error hardlink %s, %d\n", name, err); + goto out_put; + } + +out_unlock: + mutex_unlock(&dirde->d_inode->i_mutex); +out: + return err ? ERR_PTR(err) : hardde; + +out_put: + dput(hardde); + goto out_unlock; +} + +static int create_dump_hardlink(struct dentry *d, struct vfsmount *mnt, + struct inode *ino, struct cpt_context *ctx) +{ + int err; + struct dentry *hardde; + + hardde = cpt_fake_link(d, mnt, ino, ctx); + if (IS_ERR(hardde)) + return PTR_ERR(hardde); + + err = cpt_dump_dentry(hardde, mnt, 0, 1, ctx); + dput(hardde); + + return err; +} + +static int dump_one_inode(struct file *file, struct dentry *d, + struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err = 0; + struct inode *ino = d->d_inode; + cpt_object_t *iobj; + int dump_it = 0; + + iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx); + if (!iobj) + return -EINVAL; + + if (iobj->o_pos >= 0) + return 0; + + if (ino->i_sb->s_magic == FSMAGIC_PROC && + proc_dentry_of_dead_task(d)) + return 0; + + if ((!IS_ROOT(d) && d_unhashed(d)) && + !cpt_replaced(d, mnt, ctx)) + dump_it = 1; + if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) { + if (file->f_op == &eventpoll_fops || + is_signalfd_file(file)) + return 0; + dump_it = 1; + } + + if (!dump_it) + return 0; + + cpt_open_object(iobj, ctx); + cpt_dump_inode(d, mnt, ctx); + + if (!IS_ROOT(d) && d_unhashed(d)) { + struct file 
*parent; + parent = iobj->o_parent; + if (!parent || + (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) { + /* Inode is not deleted, but it does not + * have references from inside checkpointed + * process group. */ + if (ino->i_nlink != 0) { + err = find_linked_dentry(d, mnt, ino, ctx); + if (err && S_ISREG(ino->i_mode)) { + err = create_dump_hardlink(d, mnt, ino, ctx); + iobj->o_flags |= CPT_INODE_HARDLINKED; + } else if (S_ISCHR(ino->i_mode) || + S_ISBLK(ino->i_mode) || + S_ISFIFO(ino->i_mode)) + err = 0; + + if (err) { + eprintk_ctx("deleted reference to existing inode, checkpointing is impossible: %d\n", err); + return -EBUSY; + } + if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) + dump_it = 0; + } + } else { + /* Refer to _another_ file name. */ + err = cpt_dump_filename(parent, 0, ctx); + if (err) + return err; + if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) + dump_it = 0; + } + } + if (dump_it) { + if (S_ISREG(ino->i_mode)) { + if ((err = dump_content_regular(file, ctx)) != 0) { + eprintk_ctx("dump_content_regular "); + cpt_printk_dentry(d, mnt); + } + } else if (S_ISDIR(ino->i_mode)) { + /* We cannot do anything. The directory should be + * empty, so it is not a big deal. 
+ */ + } else if (S_ISCHR(ino->i_mode)) { + err = dump_content_chrdev(file, ctx); + } else if (S_ISBLK(ino->i_mode)) { + err = dump_content_blkdev(file, ctx); + } else if (S_ISFIFO(ino->i_mode)) { + err = dump_content_fifo(file, ctx); + } else if (S_ISSOCK(ino->i_mode)) { + err = dump_content_socket(file, ctx); + } else { + eprintk_ctx("unknown inode mode %o, magic 0x%lx\n", ino->i_mode & S_IFMT, ino->i_sb->s_magic); + err = -EINVAL; + } + } + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_files(struct cpt_context *ctx) +{ + int epoll_nr, inotify_nr; + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_TTY); + for_each_object(obj, CPT_OBJ_TTY) { + int err; + + if ((err = cpt_dump_tty(obj, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + cpt_open_section(ctx, CPT_SECT_INODE); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + int err; + + if ((err = dump_one_inode(file, file->f_dentry, + file->f_vfsmnt, ctx)) != 0) + return err; + } + for_each_object(obj, CPT_OBJ_FS) { + struct fs_struct *fs = obj->o_obj; + int err; + + if (fs->root.dentry && + (err = dump_one_inode(NULL, fs->root.dentry, fs->root.mnt, ctx)) != 0) + return err; + if (fs->pwd.dentry && + (err = dump_one_inode(NULL, fs->pwd.dentry, fs->pwd.mnt, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + epoll_nr = 0; + inotify_nr = 0; + cpt_open_section(ctx, CPT_SECT_FILES); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + int err; + + if ((err = dump_one_file(obj, file, ctx)) != 0) + return err; + if (file->f_op == &eventpoll_fops) + epoll_nr++; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) + inotify_nr++; + } + cpt_close_section(ctx); + + if (epoll_nr) { + cpt_open_section(ctx, CPT_SECT_EPOLL); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + if (file->f_op == &eventpoll_fops) { + int err; + if ((err = cpt_dump_epolldev(obj, ctx)) != 0) + return err; + } + } + 
cpt_close_section(ctx); + } + + if (inotify_nr) { + cpt_open_section(ctx, CPT_SECT_INOTIFY); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) { + int err = -EINVAL; +#ifdef CONFIG_INOTIFY_USER + if ((err = cpt_dump_inotify(obj, ctx)) != 0) +#endif + return err; + } + } + cpt_close_section(ctx); + } + + cpt_open_section(ctx, CPT_SECT_SOCKET); + for_each_object(obj, CPT_OBJ_SOCKET) { + int err; + + if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + return 0; +} + +static int dump_filedesc(int fd, struct file *file, + struct files_struct *f, struct cpt_context *ctx) +{ + struct cpt_fd_image *v = cpt_get_buf(ctx); + cpt_object_t *obj; + + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILEDESC; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_fd = fd; + obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx); + if (!obj) BUG(); + v->cpt_file = obj->o_pos; + v->cpt_flags = 0; + if (FD_ISSET(fd, f->fdt->close_on_exec)) + v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + cpt_close_object(ctx); + + return 0; +} + +static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct files_struct *f = obj->o_obj; + struct cpt_files_struct_image *v = cpt_get_buf(ctx); + int fd; + loff_t saved_obj; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILES; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_index = obj->o_index; + v->cpt_max_fds = f->fdt->max_fds; + v->cpt_next_fd = f->next_fd; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + for (fd = 0; fd < f->fdt->max_fds; fd++) { + struct file *file = fcheck_files(f, fd); + if (file) + dump_filedesc(fd, file, f, ctx); + } + 
cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return 0; +} + +int cpt_dump_files_struct(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_FILES_STRUCT); + + for_each_object(obj, CPT_OBJ_FILES) { + int err; + + if ((err = dump_one_file_struct(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +int cpt_collect_fs(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->fs) { + if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->pwd.dentry && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd.dentry->d_inode, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->root.dentry && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->root.dentry->d_inode, ctx) == NULL) + return -ENOMEM; + } + } + return 0; +} + +int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) +{ + struct file file; + + memset(&file, 0, sizeof(file)); + + file.f_dentry = d; + file.f_vfsmnt = mnt; + file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK; + file.f_cred = current->cred; + + return dump_one_file(NULL, &file, ctx); +} + +static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct fs_struct *fs = obj->o_obj; + struct cpt_fs_struct_image *v = cpt_get_buf(ctx); + loff_t saved_obj; + int err; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_umask = fs->umask; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + err = cpt_dump_dir(fs->root.dentry, fs->root.mnt, ctx); + if (!err) + err = cpt_dump_dir(fs->pwd.dentry, fs->pwd.mnt, ctx); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_fs_struct(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, 
CPT_SECT_FS); + + for_each_object(obj, CPT_OBJ_FS) { + int err; + + if ((err = dump_one_fs(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err = 0; + struct mnt_namespace *n = obj->o_obj; + struct list_head *p; + char *path_buf, *path; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) + return -ENOMEM; + + down_read(&namespace_sem); + list_for_each(p, &n->list) { + struct path pt; + struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); + + pt.dentry = mnt->mnt_root; + pt.mnt = mnt; + path = d_path(&pt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + + if (check_one_vfsmount(mnt)) { + eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name); + err = -EINVAL; + break; + } + } + up_read(&namespace_sem); + + free_page((unsigned long) path_buf); + + return err; +} + +int cpt_collect_namespace(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->nsproxy && tsk->nsproxy->mnt_ns && + cpt_object_add(CPT_OBJ_NAMESPACE, + tsk->nsproxy->mnt_ns, ctx) == NULL) + return -ENOMEM; + } + + for_each_object(obj, CPT_OBJ_NAMESPACE) { + int err; + if ((err = check_one_namespace(obj, ctx)) != 0) + return err; + } + + return 0; +} + +struct args_t +{ + int* pfd; + char* path; + envid_t veid; +}; + +static int dumptmpfs(void *arg) +{ + int i; + struct args_t *args = arg; + int *pfd = args->pfd; + int fd0, fd2; + char *path = args->path; + char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL }; + + i = real_env_create(args->veid, VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); + if (i < 0) { + eprintk("cannot enter ve to dump tmpfs\n"); + module_put(THIS_MODULE); + return 255 << 8; + } + + if (pfd[1] != 1) + sc_dup2(pfd[1], 1); + set_fs(KERNEL_DS); + fd0 = sc_open("/dev/null", O_RDONLY, 0); + fd2 = sc_open("/dev/null", O_WRONLY, 0); + if (fd0 < 
0 || fd2 < 0) { + eprintk("can not open /dev/null for tar: %d %d\n", fd0, fd2); + module_put(THIS_MODULE); + return 255 << 8; + } + if (fd0 != 0) + sc_dup2(fd0, 0); + if (fd2 != 2) + sc_dup2(fd2, 2); + + for (i = 3; i < current->files->fdt->max_fds; i++) { + sc_close(i); + } + + module_put(THIS_MODULE); + + i = sc_execve("/bin/tar", argv, NULL); + eprintk("failed to exec /bin/tar: %d\n", i); + return 255 << 8; +} + +static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx) +{ + int err; + int pid; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + char buf[16]; + int n; + loff_t saved_obj; + struct args_t args; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + struct ve_struct *oldenv; + + err = sc_pipe(pfd); + if (err < 0) + return err; + args.pfd = pfd; + args.path = path; + args.veid = VEID(get_exec_env()); + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + oldenv = set_exec_env(get_ve0()); + err = pid = local_kernel_thread(dumptmpfs, (void*)&args, + SIGCHLD | CLONE_VFORK, 0); + set_exec_env(oldenv); + if (err < 0) { + eprintk_ctx("tmpfs local_kernel_thread: %d\n", err); + goto out; + } + f = fget(pfd[0]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NAME; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&v, sizeof(v), ctx); + + do { + oldfs = get_fs(); set_fs(KERNEL_DS); + n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); + set_fs(oldfs); + if (n > 0) + ctx->write(buf, n, ctx); + } while (n > 0); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("tar exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("tar terminated\n"); + err = -EINVAL; + } + 
set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + buf[0] = 0; + ctx->write(buf, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + return n ? : err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); + return err; +} + +static int loopy_root(struct vfsmount *mnt) +{ + struct list_head *p; + + list_for_each(p, &mnt->mnt_ns->list) { + struct vfsmount * m = list_entry(p, struct vfsmount, mnt_list); + if (m == mnt) + return 0; + if (m->mnt_sb == mnt->mnt_sb) + return 1; + } + /* Cannot happen */ + return 0; +} + +static int cpt_dump_bind_mnt(struct vfsmount * mnt, cpt_context_t * ctx) +{ + struct list_head *p; + int err = -EINVAL; + + /* One special case: mount --bind /a /a */ + if (mnt->mnt_root == mnt->mnt_mountpoint) + return cpt_dump_dentry(mnt->mnt_root, mnt, 0, 0, ctx); + + list_for_each_prev(p, &mnt->mnt_list) { + struct vfsmount * m; + + if (p == &mnt->mnt_ns->list) + break; + + m = list_entry(p, struct vfsmount, mnt_list); + + if (m->mnt_sb != mnt->mnt_sb) + continue; + + err = cpt_dump_dentry(mnt->mnt_root, m, 0, 1, ctx); + if (err == 0) + break; + } + return err; +} + +static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err = 0; + struct cpt_vfsmount_image v; + loff_t saved_obj; + char *path_buf, *path; + struct path p; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) + return -ENOMEM; + + p.dentry = mnt->mnt_root; + p.mnt = mnt; + path = d_path(&p, path_buf, PAGE_SIZE); + if (IS_ERR(path)) { + free_page((unsigned long) path_buf); + return PTR_ERR(path) == -EINVAL ? 
0 : PTR_ERR(path); + } + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_VFSMOUNT; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + v.cpt_mntflags = mnt->mnt_flags; + if (top_beancounter(slab_ub(mnt)) != top_beancounter(get_exec_ub())) { + v.cpt_mntflags |= CPT_MNT_EXT; + } else { + if (mnt->mnt_root != mnt->mnt_sb->s_root || loopy_root(mnt)) + v.cpt_mntflags |= CPT_MNT_BIND; + } + v.cpt_flags = mnt->mnt_sb->s_flags; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + cpt_dump_string(mnt->mnt_devname ? : "none", ctx); + cpt_dump_string(path, ctx); + cpt_dump_string(mnt->mnt_sb->s_type->name, ctx); + + if (v.cpt_mntflags & CPT_MNT_BIND) { + err = cpt_dump_bind_mnt(mnt, ctx); + + /* Temporary solution for Ubuntu 8.04 */ + if (err == -EINVAL && !strcmp(path, "/dev/.static/dev")) { + cpt_dump_string("/dev", ctx); + err = 0; + } + } + else if (!(v.cpt_mntflags & CPT_MNT_EXT)) { + + if (mnt->mnt_sb->s_type->fs_flags & FS_REQUIRES_DEV) { + eprintk_ctx("Checkpoint supports only nodev fs: %s\n", + mnt->mnt_sb->s_type->name); + err = -EXDEV; + } else if (!strcmp(mnt->mnt_sb->s_type->name, "tmpfs")) { + mntget(mnt); + up_read(&namespace_sem); + err = cpt_dump_tmpfs(path, ctx); + down_read(&namespace_sem); + if (!err && list_empty(&mnt->mnt_list)) + err = -EBUSY; + mntput(mnt); + } + } + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + if (!err && mnt->mnt_sb->s_magic == FSMAGIC_VEFS) + vefs_track_force_stop(mnt->mnt_sb); + + free_page((unsigned long) path_buf); + + return err; +} + +static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct mnt_namespace *n = obj->o_obj; + struct cpt_object_hdr v; + struct vfsmount *rootmnt, *p; + loff_t saved_obj; + int err = 0; + + cpt_open_object(obj, ctx); + + v.cpt_next = -1; + v.cpt_object = CPT_OBJ_NAMESPACE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + ctx->write(&v, sizeof(v), ctx); + + 
cpt_push_object(&saved_obj, ctx); + + down_read(&namespace_sem); + rootmnt = n->root; + for (p = rootmnt; p; p = next_mnt(p, rootmnt)) { + err = dump_vfsmount(p, ctx); + if (err) + break; + } + up_read(&namespace_sem); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_namespace(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_NAMESPACE); + + for_each_object(obj, CPT_OBJ_NAMESPACE) { + int err; + + if ((err = dump_one_namespace(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_files.h linux-2.6.32.48-openvz/kernel/cpt/cpt_files.h --- linux-2.6.32.48/kernel/cpt/cpt_files.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_files.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,77 @@ +int cpt_collect_files(cpt_context_t *); +int cpt_collect_fs(cpt_context_t *); +int cpt_collect_namespace(cpt_context_t *); +int cpt_collect_sysvsem_undo(cpt_context_t *); +int cpt_collect_tty(struct file *, cpt_context_t *); +int cpt_dump_files(struct cpt_context *ctx); +int cpt_dump_files_struct(struct cpt_context *ctx); +int cpt_dump_fs_struct(struct cpt_context *ctx); +int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx); +int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx); +int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx); +struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx); +struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx); +struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx); +__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx); + +int rst_posix_locks(struct cpt_context *ctx); + +struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx); +int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx); 
+int rst_files_std(struct cpt_task_image *ti, struct cpt_context *ctx); +__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_restore_fs(struct cpt_context *ctx); + +int cpt_collect_sysv(cpt_context_t *); +int cpt_dump_sysvsem(struct cpt_context *ctx); +int cpt_dump_sysvmsg(struct cpt_context *ctx); +int rst_sysv_ipc(struct cpt_context *ctx); +int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx); + +int cpt_dump_namespace(struct cpt_context *ctx); +int rst_root_namespace(struct cpt_context *ctx); + +int rst_stray_files(struct cpt_context *ctx); +int rst_tty_jobcontrol(struct cpt_context *ctx); + +void rst_flush_filejobs(struct cpt_context *); +int rst_do_filejobs(struct cpt_context *); + +extern struct file_operations eventpoll_fops; +extern struct file_operations signalfd_fops; + +int rst_eventpoll(struct cpt_context *); +struct file *cpt_open_epolldev(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx); +int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *); + +int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx); +int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp, + loff_t *pos, struct cpt_context *ctx); + +int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx); +int rst_inotify(cpt_context_t *ctx); +struct file *rst_open_inotify(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx); + +struct dentry *cpt_fake_link(struct dentry *d, struct vfsmount *mnt, + struct inode *ino, struct cpt_context *ctx); + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, + int verify, cpt_context_t *ctx); + +#define check_one_vfsmount(mnt) \ + (strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && \ + 
strcmp(mnt->mnt_sb->s_type->name, "ext2") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "unionfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "binfmt_misc") != 0) diff -urNp linux-2.6.32.48/kernel/cpt/cpt_fsmagic.h linux-2.6.32.48-openvz/kernel/cpt/cpt_fsmagic.h --- linux-2.6.32.48/kernel/cpt/cpt_fsmagic.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_fsmagic.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,17 @@ +/* Collected from kernel sources. */ + +#define FSMAGIC_TMPFS 0x01021994 +#define FSMAGIC_PIPEFS 0x50495045 +#define FSMAGIC_SOCKFS 0x534F434B +#define FSMAGIC_PFMFS 0xa0b4d889 +#define FSMAGIC_BDEV 0x62646576 +#define FSMAGIC_FUTEX 0x0BAD1DEA +#define FSMAGIC_INOTIFY 0x2BAD1DEA +#define FSMAGIC_MQUEUE 0x19800202 +#define FSMAGIC_PROC 0x9fa0 +#define FSMAGIC_DEVPTS 0x1CD1 +#define FSMAGIC_AUTOFS 0x0187 +#define FSMAGIC_EXT2 0xEF53 +#define FSMAGIC_REISER 0x52654973 +#define FSMAGIC_VEFS 0x565a4653 +#define FSMAGIC_ANON 0x09041934 diff -urNp linux-2.6.32.48/kernel/cpt/cpt_inotify.c linux-2.6.32.48-openvz/kernel/cpt/cpt_inotify.c --- linux-2.6.32.48/kernel/cpt/cpt_inotify.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_inotify.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,174 @@ +/* + * + * kernel/cpt/cpt_inotify.c + * + * Copyright (C) 2000-2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../fs/notify/inotify/inotify.h" + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +static int dump_watch_inode(struct path *path, cpt_context_t *ctx) +{ + int err; + struct dentry *d; + + d = path->dentry; + if (IS_ROOT(d) || !d_unhashed(d)) + goto dump_dir; + + d = cpt_fake_link(d->d_inode->i_nlink ? d : NULL, + path->mnt, d->d_inode, ctx); + + if (IS_ERR(d)) + return PTR_ERR(d); + +dump_dir: + err = cpt_dump_dir(d, path->mnt, ctx); + if (d != path->dentry) + dput(d); + + return err; +} + +static int cpt_dump_watches(struct fsnotify_group *g, struct cpt_context *ctx) +{ + int err = 0; + struct fsnotify_mark_entry *fse; + struct inotify_inode_mark_entry *ie; + struct cpt_inotify_wd_image wi; + loff_t saved_obj; + + /* FIXME locking */ + list_for_each_entry(fse, &g->mark_entries, g_list) { + struct path path; + + ie = container_of(fse, struct inotify_inode_mark_entry, + fsn_entry); + + cpt_open_object(NULL, ctx); + + wi.cpt_next = CPT_NULL; + wi.cpt_object = CPT_OBJ_INOTIFY_WATCH; + wi.cpt_hdrlen = sizeof(wi); + wi.cpt_content = CPT_CONTENT_ARRAY; + wi.cpt_wd = ie->wd; + wi.cpt_mask = fse->mask; + + ctx->write(&wi, sizeof(wi), ctx); + + cpt_push_object(&saved_obj, ctx); + spin_lock(&fse->lock); + if (ie->path.dentry == NULL) { + err = -EINVAL; + eprintk_ctx("inotify mark without path\n"); + spin_unlock(&fse->lock); + break; + } + + path = ie->path; + path_get(&path); + spin_unlock(&fse->lock); + + err = dump_watch_inode(&path, ctx); + cpt_pop_object(&saved_obj, ctx); + path_put(&path); + + if (err) + break; + + cpt_close_object(ctx); + } + + return err; +} + +static int cpt_dump_events(struct fsnotify_group *g, 
struct cpt_context *ctx) +{ + /* FIXME - implement */ + if (!list_empty(&g->notification_list)) + wprintk_ctx("Inotify events are lost. Sorry...\n"); + + return 0; +} + +int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx) +{ + int err; + struct file *file = obj->o_obj; + struct fsnotify_group *group; + struct cpt_inotify_image ii; + loff_t saved_obj; + + if (file->f_op != &inotify_fops) { + eprintk_ctx("bad inotify file\n"); + return -EINVAL; + } + + group = file->private_data; + if (unlikely(group == NULL)) { + eprintk_ctx("bad inotify group\n"); + return -EINVAL; + } + + if (group->inotify_data.fa != NULL) { + eprintk_ctx("inotify with fasync\n"); + return -ENOTSUPP; + } + + cpt_open_object(NULL, ctx); + + ii.cpt_next = CPT_NULL; + ii.cpt_object = CPT_OBJ_INOTIFY; + ii.cpt_hdrlen = sizeof(ii); + ii.cpt_content = CPT_CONTENT_ARRAY; + ii.cpt_file = obj->o_pos; + ii.cpt_user = group->inotify_data.user->uid; + ii.cpt_max_events = group->max_events; + ii.cpt_last_wd = group->max_events; + + ctx->write(&ii, sizeof(ii), ctx); + cpt_push_object(&saved_obj, ctx); + + err = cpt_dump_watches(group, ctx); + if (err == 0) + err = cpt_dump_events(group, ctx); + + cpt_pop_object(&saved_obj, ctx); + cpt_close_object(ctx); + + return err; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_kernel.c linux-2.6.32.48-openvz/kernel/cpt/cpt_kernel.c --- linux-2.6.32.48/kernel/cpt/cpt_kernel.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_kernel.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,185 @@ +/* + * + * kernel/cpt/cpt_kernel.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
#ifdef CONFIG_X86_32

/*
 * Create a kernel thread with a caller-chosen pid (x86-32 variant).
 *
 * A zeroed register frame is prepared so that the new task starts in
 * kernel_thread_helper with @fn in bx and @arg in dx, then the task is
 * created through do_fork_pid() with CLONE_UNTRACED added to @flags.
 * The result of do_fork_pid() is returned unchanged.
 */
extern void kernel_thread_helper(void);
int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
{
	struct pt_regs regs;

	memset(&regs, 0, sizeof(regs));

	regs.bx = (unsigned long) fn;
	regs.dx = (unsigned long) arg;

	regs.ds = __USER_DS;
	regs.es = __USER_DS;
	regs.fs = __KERNEL_PERCPU;
	regs.gs = __KERNEL_STACK_CANARY;
	regs.orig_ax = -1;
	regs.ip = (unsigned long) kernel_thread_helper;
	regs.cs = __KERNEL_CS | get_kernel_rpl();
	regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;

	/* Ok, create the new process.. */
	return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL, pid);
}
#endif

#ifdef CONFIG_IA64
/*
 * Create a kernel thread with a caller-chosen pid (IA64 variant).
 *
 * start_kernel_thread is read as a two-word function descriptor: word 0
 * becomes the entry IP and word 1 the GP.  @fn and @arg are passed in r9
 * and r11.  The task is created through do_fork_pid() with CLONE_UNTRACED
 * added to @flags; its result is returned unchanged.
 */
pid_t
asm_kernel_thread (int (*fn)(void *), void *arg, unsigned long flags, pid_t pid)
{
	extern void start_kernel_thread (void);
	unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
	struct {
		struct switch_stack sw;
		struct pt_regs pt;
	} regs;

	memset(&regs, 0, sizeof(regs));
	regs.pt.cr_iip = helper_fptr[0];	/* set entry point (IP) */
	regs.pt.r1 = helper_fptr[1];		/* set GP */
	regs.pt.r9 = (unsigned long) fn;	/* 1st argument */
	regs.pt.r11 = (unsigned long) arg;	/* 2nd argument */
	/* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read.  */
	regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
	regs.pt.cr_ifs = 1UL << 63;		/* mark as valid, empty frame */
	regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
	regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
	regs.sw.pr = (1 << 2 /*PRED_KERNEL_STACK*/);
	return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL, pid);
}
#endif
/*
 * Describe the kernel build as a bitmask of CPT_KERNEL_CONFIG_* bits.
 *
 * The only bit reported is CPT_KERNEL_CONFIG_PAE, set for x86 kernels
 * built with CONFIG_X86_PAE or CONFIG_X86_64.  Every other configuration
 * yields 0.
 */
unsigned int test_kernel_config(void)
{
#if defined(CONFIG_X86) && \
    (defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64))
	return 1 << CPT_KERNEL_CONFIG_PAE;
#else
	return 0;
#endif
}
/*
 * test_one_flag_old / test_one_flag - compare one capability bit.
 *
 * If bit @flag is set in @src but clear in @dst, print
 * "Destination cpu does not have <message>" and store 1 in @ret;
 * otherwise @ret is left untouched.  @message must be a string literal
 * (it is pasted into the format string).  The _old variant reports via
 * wprintk(), the other via eprintk_ctx().
 *
 * Both expand to a single do { } while (0) statement, so they are safe
 * inside unbraced if/else; invoke them as a statement terminated by ';'.
 */
#define test_one_flag_old(src, dst, flag, message, ret)			\
do {									\
	if (((src) & (1 << (flag))) && !((dst) & (1 << (flag)))) {	\
		wprintk("Destination cpu does not have " message "\n");	\
		(ret) = 1;						\
	}								\
} while (0)

#define test_one_flag(src, dst, flag, message, ret)			\
do {									\
	if (((src) & (1 << (flag))) && !((dst) & (1 << (flag)))) {	\
		eprintk_ctx("Destination cpu does not have " message "\n"); \
		(ret) = 1;						\
	}								\
} while (0)
NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +static inline struct timespec +_ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + s32 rem; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_mm.c linux-2.6.32.48-openvz/kernel/cpt/cpt_mm.c --- linux-2.6.32.48/kernel/cpt/cpt_mm.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_mm.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,923 @@ +/* + * + * kernel/cpt/cpt_mm.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +#include "cpt_pagein.h" +#endif +#include "cpt_ubc.h" + +static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, + cpt_context_t *ctx) +{ + if (!list_empty(&aio_ctx->run_list)) { + /* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */ + eprintk_ctx("run list is not empty, cannot suspend AIO\n"); + return -EBUSY; + } + + /* Wait for pending IOCBs. Linux AIO is mostly _fake_. + * It is actually synchronous, except for direct IO and + * some funny raw USB things, which cannot happen inside VE. + * However, we do this for future. + * + * Later note: in 2.6.16 we may allow O_DIRECT, so that + * it is not meaningless code. 
+ */ + wait_for_all_aios(aio_ctx); + + if (!list_empty(&aio_ctx->run_list) || + !list_empty(&aio_ctx->active_reqs) || + aio_ctx->reqs_active) { + eprintk_ctx("were not able to suspend AIO\n"); + return -EBUSY; + } + + return 0; +} + +static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx) +{ + struct vm_area_struct *vma; + struct hlist_node *n; + struct kioctx *aio_ctx; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file) { + if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL) + return -ENOMEM; + } + } + + if (mm->exe_file && + cpt_object_add(CPT_OBJ_FILE, mm->exe_file, ctx) == NULL) + return -ENOMEM; + +#ifdef CONFIG_BEANCOUNTERS + if (cpt_add_ubc(mm->mm_ub, ctx) == NULL) + return -ENOMEM; +#endif + + hlist_for_each_entry(aio_ctx, n, &mm->ioctx_list, list) { + int err; + + if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0) + return err; + } + + return 0; +} + +int cpt_collect_mm(cpt_context_t * ctx) +{ + cpt_object_t *obj; + int err; + int index; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL) + return -ENOMEM; + } + + index = 1; + for_each_object(obj, CPT_OBJ_MM) { + struct mm_struct *mm = obj->o_obj; + if (obj->o_count != atomic_read(&mm->mm_users)) { + eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users)); + return -EAGAIN; + } + cpt_obj_setindex(obj, index++, ctx); + + if ((err = collect_one_mm(mm, ctx)) != 0) + return err; + } + + return 0; +} + +static int zcnt, scnt, scnt0, ucnt; + +/* Function where_is_anon_page() returns address of a anonymous page in mm + * of already dumped process. This happens f.e. after fork(). We do not use + * this right now, just keep statistics, it is diffucult to restore such state, + * but the most direct use is to save space in dumped image. 
/*
 * Check whether @address in @vma's mm currently maps exactly @page.
 *
 * Walks pgd -> pud -> pmd -> pte for @address and compares the pfn stored
 * in the pte with the pfn of @page.
 *
 * Returns 1 if a present pte at @address points at @page, 0 if any level
 * of the walk is not present or the pfn differs.
 */
static int really_this_one(struct vm_area_struct *vma, unsigned long address,
			   struct page *page)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	int result;

	pgd = pgd_offset(mm, address);
	if (unlikely(!pgd_present(*pgd)))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	if (unlikely(!pmd_present(*pmd)))
		return 0;

	result = 0;
	pte = pte_offset_map(pmd, address);
	/* Unlocked peek first: bail out without taking the lock. */
	if (!pte_present(*pte)) {
		pte_unmap(pte);
		return 0;
	}

	/* Re-check presence under the pte lock before trusting the pfn. */
	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte))
		result = 1;
	pte_unmap_unlock(pte, ptl);
	return result;
}
/*
 * A run of consecutive pages of one VMA that share the same PD_* type.
 * dump_one_vma() grows the run page by page and emits one image block
 * per run.
 */
struct page_area
{
	int type;		/* PD_* type shared by all pages of the run */
	unsigned long start;	/* first address of the run */
	unsigned long end;	/* address just past the run */
	pgoff_t pgoff;		/* index (page_desc.index) of the last page added */
	loff_t mm;		/* page_desc.mm of the last page added */
	__u64 list[16];		/* per-page indices, filled for PD_ITER/PD_ITERYOUNG runs */
};

/* Classification of a single page, filled in by page_get_desc(). */
struct page_desc
{
	int type;		/* one of the PD_* values below */
	pgoff_t index;		/* linear page offset, pte file offset, or pfn for PD_ITER* */
	loff_t mm;		/* image position of the mm already holding this page, or CPT_NULL */
	int shared;		/* 1 if the page has a mapping and is mapped more than once */
};

enum {
	PD_ABSENT,	/* 0: nothing to copy, see the legend below */
	PD_COPY,	/* 1: page content must be copied */
	PD_ZERO,	/* 2: must be copied, but content is zero */
	PD_CLONE,	/* 3: page already present in an earlier dumped mm */
	PD_FUNKEY,	/* 4: unsupported page */
	PD_LAZY,	/* swapped-out page, or present page whose pte is not young */
	PD_ITER,	/* PG_checkpointed page found after ctx->iter_done, pte not young */
	PD_ITERYOUNG,	/* same as PD_ITER, but the pte is young */
};

/* 0: page can be obtained from backstore, or still not mapped anonymous page,
      or something else, which does not require copy.
   1: page requires copy
   2: page requires copy but its content is zero. Quite useless.
   3: wp page is shared after fork(). It is to be COWed when modified.
   4: page is something unsupported... We copy it right now.
      NOTE(review): dump_one_vma() fails with -EINVAL on such a page
      instead of copying it; this line looks stale.
 */
/*
 * Classify the page mapped at @addr in @vma and describe it in @pdesc.
 *
 * @mmobj: checkpoint object of the mm being dumped (used for messages and
 *         handed to where_is_anon_page()).
 * @pdesc: output; ->type receives a PD_* value, ->index the page index,
 *         ->shared and ->mm describe sharing with other mms.
 *
 * The function returns nothing; unsupported situations are reported by
 * setting ->type to PD_FUNKEY (and bumping ucnt).
 */
static void page_get_desc(cpt_object_t *mmobj,
			  struct vm_area_struct *vma, unsigned long addr,
			  struct page_desc *pdesc, cpt_context_t * ctx)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *pg = NULL;
	pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;

	/* Defaults: linear offset inside the VMA, not shared, no source mm. */
	pdesc->index = linear_index;
	pdesc->shared = 0;
	pdesc->mm = CPT_NULL;

	/* VM_IO mappings are never examined; they are reported as absent. */
	if (vma->vm_flags & VM_IO) {
		pdesc->type = PD_ABSENT;
		return;
	}

	pgd = pgd_offset(mm, addr);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out_absent;
	pud = pud_offset(pgd, addr);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out_absent;
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out_absent;
#ifdef CONFIG_X86
	if (pmd_huge(*pmd)) {
		eprintk_ctx("page_huge\n");
		goto out_unsupported;
	}
#endif
#ifdef CONFIG_VZ_CHECKPOINT_LAZY
retry:
#endif
	/* Snapshot the pte; ptl stays held until one of the unlocks below. */
	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = *ptep;
	pte_unmap(ptep);

	if (pte_none(pte))
		goto out_absent_unlock;

	if (!pte_present(pte)) {
		if (pte_file(pte)) {
			/* Nonlinear file pte: report the file offset it encodes. */
			pdesc->index = pte_to_pgoff(pte);
			goto out_absent_unlock;
		}
		if (vma->vm_flags & VM_SHARED) {
			/* It is impossible: shared mappings cannot be in swap */
			eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos);
			goto out_unsupported_unlock;
		}
#ifdef CONFIG_VZ_CHECKPOINT_LAZY
		/* Otherwise it is in swap. */
		if (!ctx->lazy_vm) {
			int err;
			/* If lazy transfer is not enabled,
			 * raise it from swap now, so that we
			 * save at least when the page is shared.
			 */
			spin_unlock(ptl);
			err = handle_mm_fault(mm, vma, addr, 0);
			/* NOTE(review): the result is compared for equality with
			 * single VM_FAULT_* values; confirm no combined value
			 * can be returned here. */
			if (err == VM_FAULT_SIGBUS)
				goto out_absent;
			if (err == VM_FAULT_OOM)
				goto out_absent;
			err = 0;
			goto retry;
		}
#endif
		pdesc->type = PD_LAZY;
		goto out_unlock;
	}

	/* Present pte without a normal struct page: copy it. */
	if ((pg = vm_normal_page(vma, addr, pte)) == NULL) {
		pdesc->type = PD_COPY;
		goto out_unlock;
	}

	/* Pin the page, then drop the pte lock; out_put releases the pin. */
	get_page(pg);
	spin_unlock(ptl);

	if (pg->mapping && !PageAnon(pg)) {
		if (vma->vm_file == NULL) {
			eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
			goto out_unsupported;
		}
		if (vma->vm_file->f_mapping != pg->mapping) {
			eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n",
				    addr, vma->vm_file->f_mapping, pg->mapping,
				    mmobj->o_pos);
			goto out_unsupported;
		}
		pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
		/* Page is in backstore. For us it is like
		 * it is not present.
		 */
		goto out_absent;
	}

	if (PageReserved(pg)) {
		/* Special case: ZERO_PAGE is used, when an
		 * anonymous page is accessed but not written. */
		if (pg == ZERO_PAGE(addr)) {
			if (pte_write(pte)) {
				eprintk_ctx("not funny already, writable ZERO_PAGE\n");
				goto out_unsupported;
			}
			zcnt++;
			goto out_absent;
		}
		eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index,
			    addr, mmobj->o_pos);
		goto out_unsupported;
	}

	/* A ZERO_PAGE that is not PageReserved only triggers a warning. */
	if (pg == ZERO_PAGE(addr)) {
		wprintk_ctx("that's how it works now\n");
	}

	if (!pg->mapping) {
		eprintk_ctx("page without mapping at %08lx@%Ld\n", addr,
			    mmobj->o_pos);
		goto out_unsupported;
	}

	/* Mapped more than once: try to find it in an already dumped mm. */
	if (pg->mapping && page_mapcount(pg) > 1) {
		pdesc->shared = 1;
		pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx);
		if (pdesc->mm != CPT_NULL) {
			scnt0++;
			pdesc->type = PD_CLONE;
			goto out_put;
		} else {
			scnt++;
		}
	}
#ifdef CONFIG_VZ_CHECKPOINT_ITER
	if (ctx->iter_done &&
	    test_bit(PG_checkpointed, &pg->flags)) {
		if (pte_write(pte)) {
			wprintk_ctx("writable PG_checkpointed page\n");
		}
		/* For iterative pages the index carries the pfn instead. */
		pdesc->index = page_to_pfn(pg);
		pdesc->type = pte_young(pte) ? PD_ITERYOUNG : PD_ITER;
		goto out_put;
	}
#endif
	/* Recently referenced pages are copied, the rest may go lazily. */
	pdesc->type = pte_young(pte) ? PD_COPY : PD_LAZY;

out_put:
	/* pg is NULL on every path that never reached get_page(). */
	if (pg)
		put_page(pg);
	return;

out_unlock:
	spin_unlock(ptl);
	goto out_put;

out_absent_unlock:
	spin_unlock(ptl);
out_absent:
	pdesc->type = PD_ABSENT;
	goto out_put;

out_unsupported_unlock:
	spin_unlock(ptl);
out_unsupported:
	ucnt++;
	pdesc->type = PD_FUNKEY;
	goto out_put;
}
+ */ +void dump_pages(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct cpt_context *ctx) +{ +#define MAX_PAGE_BATCH 16 + struct page *pg[MAX_PAGE_BATCH]; + int npages = (end - start)/PAGE_SIZE; + int count = 0; + + while (count < npages) { + int copy = npages - count; + int n; + + if (copy > MAX_PAGE_BATCH) + copy = MAX_PAGE_BATCH; + n = get_user_pages(current, vma->vm_mm, start, copy, + 0, 1, pg, NULL); + if (n == copy) { + int i; + for (i=0; iwrite(maddr, PAGE_SIZE, ctx); + kunmap(pg[i]); + } + } else { + eprintk_ctx("get_user_pages fault"); + for ( ; n > 0; n--) + page_cache_release(pg[n-1]); + return; + } + start += n*PAGE_SIZE; + count += n; + for ( ; n > 0; n--) + page_cache_release(pg[n-1]); + } + return; +} + +int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb, + int copy, + struct cpt_context *ctx) +{ + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES; + pgb->cpt_hdrlen = sizeof(*pgb); + pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ? 
CPT_CONTENT_DATA : CPT_CONTENT_VOID; + + ctx->write(pgb, sizeof(*pgb), ctx); + if (copy == PD_COPY || copy == PD_LAZY) + dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa, + struct cpt_context *ctx) +{ + struct cpt_remappage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_REMAPPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1; + + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa, + struct cpt_context *ctx) +{ + struct cpt_copypage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_COPYPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + pgb.cpt_source = pa->mm; + + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa, + cpt_context_t *ctx) +{ + struct cpt_lazypage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_LAZYPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start, + (pa->end-pa->start)/PAGE_SIZE, ctx); +#endif + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_iterpage_block(struct vm_area_struct *vma, struct 
page_area *pa, + cpt_context_t *ctx) +{ + struct cpt_iterpage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = pa->type == PD_ITER ? CPT_OBJ_ITERPAGES : + CPT_OBJ_ITERYOUNGPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + ctx->write(&pgb, sizeof(pgb), ctx); + + ctx->write(pa->list, 8*((pa->end-pa->start)/PAGE_SIZE), ctx); + + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + + +static int can_expand(struct page_area *pa, struct page_desc *pd) +{ + if (pa->start == pa->end) + return 1; + if (pa->type != pd->type) + return 0; + if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) { + if (pa->end - pa->start >= PAGE_SIZE*16) + return 0; + pa->list[(pa->end - pa->start)/PAGE_SIZE] = pd->index; + } + if (pa->type == PD_ABSENT) + return pd->index == pa->pgoff + 1; + if (pa->type == PD_CLONE) + return pd->mm == pa->mm; + return 1; +} + +static int dump_one_vma(cpt_object_t *mmobj, + struct vm_area_struct *vma, struct cpt_context *ctx) +{ + struct cpt_vma_image *v = cpt_get_buf(ctx); + unsigned long addr; + loff_t saved_object; + struct cpt_page_block pgb; + struct page_area pa; + int cloned_pages = 0; + + cpt_push_object(&saved_object, ctx); + + v->cpt_object = CPT_OBJ_VMA; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_start = vma->vm_start; + v->cpt_end = vma->vm_end; + v->cpt_flags = vma->vm_flags; + if (vma->vm_flags&VM_HUGETLB) { + eprintk_ctx("huge TLB VMAs are still not supported\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_pgprot = vma->vm_page_prot.pgprot; + v->cpt_pgoff = vma->vm_pgoff; + v->cpt_file = CPT_NULL; +#ifndef CONFIG_IA64 + if ((void *)vma->vm_start == vma->vm_mm->context.vdso && + vma->vm_ops == &special_mapping_vmops) + v->cpt_type = CPT_VMA_VDSO; + else +#endif + v->cpt_type = CPT_VMA_TYPE_0; + v->cpt_anonvma = 0; + + /* We have to remember what 
VMAs are bound to one anon_vma. + * So, we store an identifier of group of VMAs. It is handy + * to use absolute address of anon_vma as this identifier. */ + v->cpt_anonvmaid = (unsigned long)vma->anon_vma; + + if (vma->vm_file) { + struct file *filp; + cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx); + if (obj == NULL) BUG(); + filp = obj->o_obj; + if (filp->f_op == &shm_file_operations) { + struct shm_file_data *sfd = filp->private_data; + + v->cpt_type = CPT_VMA_TYPE_SHM; + obj = lookup_cpt_object(CPT_OBJ_FILE, sfd->file, ctx); + } + v->cpt_file = obj->o_pos; + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + if (v->cpt_type == CPT_VMA_VDSO) + goto out; + + pa.type = PD_ABSENT; + pa.pgoff = vma->vm_pgoff; + pa.mm = CPT_NULL; + pa.start = vma->vm_start; + pa.end = vma->vm_start; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + struct page_desc pd; + + page_get_desc(mmobj, vma, addr, &pd, ctx); + cloned_pages += pd.shared; + + if (pd.type == PD_FUNKEY) { + eprintk_ctx("dump_one_vma: funkey page\n"); + return -EINVAL; + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (pd.type == PD_LAZY && + (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED))) + pd.type = PD_COPY; +#else + if (pd.type == PD_LAZY) + pd.type = PD_COPY; +#endif + + if (!can_expand(&pa, &pd)) { + if (pa.type == PD_COPY || + pa.type == PD_ZERO) { + pgb.cpt_start = pa.start; + pgb.cpt_end = pa.end; + dump_page_block(vma, &pgb, pa.type, ctx); + } else if (pa.type == PD_CLONE) { + dump_copypage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_LAZY) { + dump_lazypage_block(vma, &pa, ctx); + } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { + dump_iterpage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_ABSENT && + pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { + dump_remappage_block(vma, &pa, ctx); + } + pa.start = addr; + } + pa.type = pd.type; + pa.end = addr + PAGE_SIZE; + pa.pgoff 
= pd.index; + if (addr == pa.start) + pa.list[0] = pd.index; + pa.mm = pd.mm; + } + + if (pa.end > pa.start) { + if (pa.type == PD_COPY || + pa.type == PD_ZERO) { + pgb.cpt_start = pa.start; + pgb.cpt_end = pa.end; + dump_page_block(vma, &pgb, pa.type, ctx); + } else if (pa.type == PD_CLONE) { + dump_copypage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_LAZY) { + dump_lazypage_block(vma, &pa, ctx); + } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { + dump_iterpage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_ABSENT && + pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { + dump_remappage_block(vma, &pa, ctx); + } + } + + if (cloned_pages) { + __u32 anonvma = 1; + loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma); + ctx->pwrite(&anonvma, 4, ctx, anonpos); + } + +out: + cpt_close_object(ctx); + + cpt_pop_object(&saved_object, ctx); + + return 0; +} + +static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, + cpt_context_t *ctx) +{ + loff_t saved_object; + struct cpt_aio_ctx_image aimg; + + if (!list_empty(&aio_ctx->run_list) || + !list_empty(&aio_ctx->active_reqs) || + aio_ctx->reqs_active) { + eprintk_ctx("AIO is active after suspend\n"); + return -EBUSY; + } + + cpt_push_object(&saved_object, ctx); + + aimg.cpt_next = CPT_ALIGN(sizeof(aimg)); + aimg.cpt_object = CPT_OBJ_AIO_CONTEXT; + aimg.cpt_hdrlen = sizeof(aimg); + aimg.cpt_content = CPT_CONTENT_ARRAY; + + aimg.cpt_max_reqs = aio_ctx->max_reqs; + aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages; + aimg.cpt_nr = aio_ctx->ring_info.nr; + aimg.cpt_tail = aio_ctx->ring_info.tail; + aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base; + + ctx->write(&aimg, sizeof(aimg), ctx); + + cpt_pop_object(&saved_object, ctx); + return 0; +} + +static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct mm_struct *mm = obj->o_obj; + struct vm_area_struct *vma; + struct cpt_mm_image *v = 
cpt_get_buf(ctx); + struct kioctx *aio_ctx; + struct hlist_node *n; + + cpt_open_object(obj, ctx); + + v->cpt_next = -1; + v->cpt_object = CPT_OBJ_MM; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_start_code = mm->start_code; + v->cpt_end_code = mm->end_code; + v->cpt_start_data = mm->start_data; + v->cpt_end_data = mm->end_data; + v->cpt_start_brk = mm->start_brk; + v->cpt_brk = mm->brk; + v->cpt_start_stack = mm->start_stack; + v->cpt_start_arg = mm->arg_start; + v->cpt_end_arg = mm->arg_end; + v->cpt_start_env = mm->env_start; + v->cpt_end_env = mm->env_end; + v->cpt_def_flags = mm->def_flags; +#ifdef CONFIG_BEANCOUNTERS + v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx); +#endif + /* FIXME when coredump mask exceeds 8 bits */ + WARN_ON(mm->flags >> 8); + v->cpt_dumpable = mm->flags; + v->cpt_vps_dumpable = mm->vps_dumpable; + v->cpt_used_hugetlb = 0; /* not used */ +#ifndef CONFIG_IA64 + v->cpt_vdso = (__u32)(unsigned long)mm->context.vdso; +#endif + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + +#ifdef CONFIG_X86 + if (mm->context.size) { + loff_t saved_object; + struct cpt_obj_bits b; + int size; + + dprintk_ctx("nontrivial LDT\n"); + + cpt_push_object(&saved_object, ctx); + + cpt_open_object(NULL, ctx); + b.cpt_next = CPT_NULL; + b.cpt_object = CPT_OBJ_BITS; + b.cpt_hdrlen = sizeof(b); + b.cpt_content = CPT_CONTENT_MM_CONTEXT; + b.cpt_size = mm->context.size*LDT_ENTRY_SIZE; + + ctx->write(&b, sizeof(b), ctx); + + size = mm->context.size*LDT_ENTRY_SIZE; + +#if defined(CONFIG_X86_64) || defined(CONFIG_XEN) || \ + LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19) + ctx->write(mm->context.ldt, size, ctx); +#else + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + ctx->write(kaddr, bytes, ctx); + kunmap(mm->context.ldt_pages[nr]); + } +#endif + + cpt_close_object(ctx); + 
cpt_pop_object(&saved_object, ctx); + } +#endif + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + int err; + + if ((err = dump_one_vma(obj, vma, ctx)) != 0) + return err; + } + + hlist_for_each_entry(aio_ctx, n, &mm->ioctx_list, list) { + int err; + + if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0) + return err; + } + + cpt_close_object(ctx); + + return 0; +} + +int cpt_dump_vm(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + scnt = scnt0 = zcnt = 0; + + cpt_open_section(ctx, CPT_SECT_MM); + + for_each_object(obj, CPT_OBJ_MM) { + int err; + + if ((err = dump_one_mm(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + + if (scnt) + dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt); + if (scnt0) + dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0); + if (zcnt) + dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt); + return 0; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_mm.h linux-2.6.32.48-openvz/kernel/cpt/cpt_mm.h --- linux-2.6.32.48/kernel/cpt/cpt_mm.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_mm.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,35 @@ +int cpt_collect_mm(cpt_context_t *); + +int cpt_dump_vm(struct cpt_context *ctx); + +__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx); + +int cpt_mm_prepare(unsigned long veid); + +int cpt_free_pgin_dir(struct cpt_context *); +int cpt_start_pagein(struct cpt_context *); +int rst_setup_pagein(struct cpt_context *); +int rst_complete_pagein(struct cpt_context *, int); +int rst_pageind(struct cpt_context *); +int cpt_iteration(cpt_context_t *ctx); +int rst_iteration(cpt_context_t *ctx); +void rst_drop_iter_dir(cpt_context_t *ctx); +int rst_iter(struct vm_area_struct *vma, u64 pfn, + unsigned long addr, cpt_context_t * ctx); + +int 
rst_swapoff(struct cpt_context *); + +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES +struct linux_binprm; +extern int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, + unsigned long map_address); +#endif + +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +extern struct page *vdso32_pages[1]; +#define vsyscall_addr page_address(vdso32_pages[0]) +#endif + +extern struct vm_operations_struct special_mapping_vmops; diff -urNp linux-2.6.32.48/kernel/cpt/cpt_net.c linux-2.6.32.48-openvz/kernel/cpt/cpt_net.c --- linux-2.6.32.48/kernel/cpt/cpt_net.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_net.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,652 @@ +/* + * + * kernel/cpt/cpt_net.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" +#include "cpt_syscalls.h" + +static void cpt_dump_netstats(struct net_device *dev, struct cpt_context * ctx) +{ + struct cpt_netstats_image *n; + struct net_device_stats *stats; + + if (!dev->netdev_ops->ndo_get_stats) + return; + + n = cpt_get_buf(ctx); + stats = dev->netdev_ops->ndo_get_stats(dev); + cpt_open_object(NULL, ctx); + + n->cpt_next = CPT_NULL; + n->cpt_object = CPT_OBJ_NET_STATS; + n->cpt_hdrlen = sizeof(*n); + n->cpt_content = CPT_CONTENT_VOID; + + n->cpt_rx_packets = stats->rx_packets; + n->cpt_tx_packets = stats->tx_packets; + n->cpt_rx_bytes = stats->rx_bytes; + n->cpt_tx_bytes = stats->tx_bytes; + n->cpt_rx_errors = stats->rx_errors; + n->cpt_tx_errors = stats->tx_errors; + n->cpt_rx_dropped = stats->rx_dropped; + n->cpt_tx_dropped = stats->tx_dropped; + n->cpt_multicast = 
stats->multicast; + n->cpt_collisions = stats->collisions; + n->cpt_rx_length_errors = stats->rx_length_errors; + n->cpt_rx_over_errors = stats->rx_over_errors; + n->cpt_rx_crc_errors = stats->rx_crc_errors; + n->cpt_rx_frame_errors = stats->rx_frame_errors; + n->cpt_rx_fifo_errors = stats->rx_fifo_errors; + n->cpt_rx_missed_errors = stats->rx_missed_errors; + n->cpt_tx_aborted_errors = stats->tx_aborted_errors; + n->cpt_tx_carrier_errors = stats->tx_carrier_errors; + n->cpt_tx_fifo_errors = stats->tx_fifo_errors; + n->cpt_tx_heartbeat_errors = stats->tx_heartbeat_errors; + n->cpt_tx_window_errors = stats->tx_window_errors; + n->cpt_rx_compressed = stats->rx_compressed; + n->cpt_tx_compressed = stats->tx_compressed; + + ctx->write(n, sizeof(*n), ctx); + cpt_close_object(ctx); + cpt_release_buf(ctx); + return; +} + +int cpt_dump_link(struct cpt_context * ctx) +{ + struct net *net = get_exec_env()->ve_netns; + struct net_device *dev; + + cpt_open_section(ctx, CPT_SECT_NET_DEVICE); + for_each_netdev(net, dev) { + struct cpt_netdev_image v; + struct cpt_hwaddr_image hw; + loff_t saved_obj; + + if (dev->netdev_ops->ndo_cpt == NULL) { + eprintk_ctx("unsupported netdev %s\n", dev->name); + cpt_close_section(ctx); + return -EBUSY; + } + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_DEVICE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + v.cpt_index = dev->ifindex; + v.cpt_flags = dev->flags; + memcpy(v.cpt_name, dev->name, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + + cpt_open_object(NULL, ctx); + dev->netdev_ops->ndo_cpt(dev, &cpt_ops, ctx); + + /* Dump hardware address */ + cpt_open_object(NULL, ctx); + hw.cpt_next = CPT_NULL; + hw.cpt_object = CPT_OBJ_NET_HWADDR; + hw.cpt_hdrlen = sizeof(hw); + hw.cpt_content = CPT_CONTENT_VOID; + + if (dev->dev_addrs.count != 1) { + eprintk_ctx("multiple hwaddrs on %s\n", dev->name); + return -EINVAL; + } + + 
BUILD_BUG_ON(sizeof(hw.cpt_dev_addr) != MAX_ADDR_LEN); + memcpy(hw.cpt_dev_addr, dev->dev_addr, sizeof(hw.cpt_dev_addr)); + ctx->write(&hw, sizeof(hw), ctx); + cpt_close_object(ctx); + + cpt_dump_netstats(dev, ctx); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + } + cpt_close_section(ctx); + return 0; +} + +int cpt_suspend_network(struct cpt_context *ctx) +{ + get_exec_env()->disable_net = 1; + synchronize_net(); + return 0; +} + +int cpt_resume_network(struct cpt_context *ctx) +{ + struct ve_struct *env; + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + env->disable_net = 0; + put_ve(env); + return 0; +} + +int cpt_dump_ifaddr(struct cpt_context * ctx) +{ + struct net *net = get_exec_env()->ve_netns; + struct net_device *dev; + + cpt_open_section(ctx, CPT_SECT_NET_IFADDR); + for_each_netdev(net, dev) { + struct in_device *idev = in_dev_get(dev); + struct in_ifaddr *ifa; + + if (!idev) + continue; + + for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) { + struct cpt_ifaddr_image v; + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IFADDR; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_index = dev->ifindex; + v.cpt_family = AF_INET; + v.cpt_masklen = ifa->ifa_prefixlen; + v.cpt_flags = ifa->ifa_flags; + v.cpt_scope = ifa->ifa_scope; + memset(&v.cpt_address, 0, sizeof(v.cpt_address)); + memset(&v.cpt_peer, 0, sizeof(v.cpt_peer)); + memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); + v.cpt_address[0] = ifa->ifa_local; + v.cpt_peer[0] = ifa->ifa_address; + v.cpt_broadcast[0] = ifa->ifa_broadcast; + memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + } + in_dev_put(idev); + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + for_each_netdev(net, dev) { + struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_ifaddr *ifa; + + if (!idev) + continue; + + for (ifa = idev->addr_list; ifa; ifa = 
ifa->if_next) { + struct cpt_ifaddr_image v; + + if (dev == net->loopback_dev && + ifa->prefix_len == 128 && + ifa->addr.s6_addr32[0] == 0 && + ifa->addr.s6_addr32[1] == 0 && + ifa->addr.s6_addr32[2] == 0 && + ifa->addr.s6_addr32[3] == htonl(1)) + continue; + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IFADDR; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_index = dev->ifindex; + v.cpt_family = AF_INET6; + v.cpt_masklen = ifa->prefix_len; + v.cpt_flags = ifa->flags; + v.cpt_scope = ifa->scope; + v.cpt_valid_lft = ifa->valid_lft; + v.cpt_prefered_lft = ifa->prefered_lft; + memcpy(&v.cpt_address, &ifa->addr, 16); + memcpy(&v.cpt_peer, &ifa->addr, 16); + memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); + memcpy(v.cpt_label, dev->name, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + } + in6_dev_put(idev); + } +#endif + cpt_close_section(ctx); + return 0; +} + +#ifdef CONFIG_IP_FIB_TRIE +#error "Trie fib rules are known not to be restored proprly yet" +#endif + +static int cpt_dump_route(struct cpt_context * ctx) +{ + int err; + struct socket *sock; + struct msghdr msg; + struct iovec iov; + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + } req; + struct sockaddr_nl nladdr; + struct cpt_object_hdr v; + mm_segment_t oldfs; + char *pg; + + err = sock_create(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); + if (err) + return err; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = RTM_GETROUTE; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.g.rtgen_family = AF_INET; + + iov.iov_base=&req; + iov.iov_len=sizeof(req); + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = 
sock_sendmsg(sock, &msg, sizeof(req)); + set_fs(oldfs); + + if (err < 0) + goto out_sock; + + pg = (char*)__get_free_page(GFP_KERNEL); + if (pg == NULL) { + err = -ENOMEM; + goto out_sock; + } + + cpt_open_section(ctx, CPT_SECT_NET_ROUTE); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_ROUTE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NLMARRAY; + + ctx->write(&v, sizeof(v), ctx); + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +restart: +#endif + for (;;) { + struct nlmsghdr *h; + + iov.iov_base = pg; + iov.iov_len = PAGE_SIZE; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); + set_fs(oldfs); + + if (err < 0) + goto out_sock_pg; + if (msg.msg_flags & MSG_TRUNC) { + err = -ENOBUFS; + goto out_sock_pg; + } + + h = (struct nlmsghdr*)pg; + while (NLMSG_OK(h, err)) { + if (h->nlmsg_type == NLMSG_DONE) { + err = 0; + goto done; + } + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h); + err = errm->error; + eprintk_ctx("NLMSG error: %d\n", errm->error); + goto done; + } + if (h->nlmsg_type != RTM_NEWROUTE) { + eprintk_ctx("NLMSG: %d\n", h->nlmsg_type); + err = -EINVAL; + goto done; + } + ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx); + h = NLMSG_NEXT(h, err); + } + if (err) { + eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type); + err = -EINVAL; + break; + } + } +done: +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (!err && req.g.rtgen_family == AF_INET) { + req.g.rtgen_family = AF_INET6; + iov.iov_base=&req; + iov.iov_len=sizeof(req); + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(sock, &msg, sizeof(req)); + set_fs(oldfs); + + if (err > 0) + goto restart; + } +#endif + 
ctx->align(ctx); + cpt_close_object(ctx); + cpt_close_section(ctx); + +out_sock_pg: + free_page((unsigned long)pg); +out_sock: + sock_release(sock); + return err; +} + +struct args_t +{ + int* pfd; + envid_t veid; +}; + +static int dumpfn(void *arg) +{ + int i; + struct args_t *args = arg; + int *pfd = args->pfd; + char *argv[] = { "iptables-save", "-c", NULL }; + + i = real_env_create(args->veid, VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); + if (i < 0) { + eprintk("cannot enter ve to dump iptables\n"); + module_put(THIS_MODULE); + return 255 << 8; + } + + if (pfd[1] != 1) + sc_dup2(pfd[1], 1); + + for (i=0; ifiles->fdt->max_fds; i++) { + if (i != 1) + sc_close(i); + } + + module_put(THIS_MODULE); + + set_fs(KERNEL_DS); + i = sc_execve("/sbin/iptables-save", argv, NULL); + if (i == -ENOENT) + i = sc_execve("/usr/sbin/iptables-save", argv, NULL); + eprintk("failed to exec iptables-save: %d\n", i); + return 255 << 8; +} + + +static int cpt_dump_iptables(struct cpt_context * ctx) +{ + int err = 0; +#ifdef CONFIG_VE_IPTABLES + int pid; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + char buf[16]; + loff_t pos; + int n; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + struct args_t args; + struct ve_struct *oldenv; + + if (!(get_exec_env()->_iptables_modules & VE_IP_IPTABLES_MOD)) + return 0; + + err = sc_pipe(pfd); + if (err < 0) { + eprintk_ctx("sc_pipe: %d\n", err); + return err; + } + args.pfd = pfd; + args.veid = VEID(get_exec_env()); + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + oldenv = set_exec_env(get_ve0()); + err = pid = local_kernel_thread(dumpfn, (void*)&args, + SIGCHLD | CLONE_VFORK, 0); + set_exec_env(oldenv); + if (err < 0) { + eprintk_ctx("local_kernel_thread: %d\n", err); + goto out; + } + + f = fget(pfd[0]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + cpt_open_section(ctx, CPT_SECT_NET_IPTABLES); + + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NAME; + 
v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&v, sizeof(v), ctx); + + pos = ctx->file->f_pos; + do { + oldfs = get_fs(); set_fs(KERNEL_DS); + n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); + set_fs(oldfs); + if (n > 0) + ctx->write(buf, n, ctx); + } while (n > 0); + + if (n < 0) + eprintk_ctx("read: %d\n", n); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("iptables-save exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("iptables-save terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + if (ctx->file->f_pos != pos) { + buf[0] = 0; + ctx->write(buf, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_close_section(ctx); + } else { + pos = ctx->current_section; + cpt_close_object(ctx); + cpt_close_section(ctx); + ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL; + ctx->file->f_pos = pos; + } + return n ? 
: err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); +#endif + return err; +} + +static unsigned long fold_field(void *mib[], int offt) +{ + unsigned long res = 0; + int i; + + for_each_possible_cpu(i) { + res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); + res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); + } + return res; +} + +static void cpt_dump_snmp_stat(struct cpt_context *ctx, void *mib[], int n) +{ + int i; + struct cpt_object_hdr o; + __u32 *stats; + + stats = cpt_get_buf(ctx); + + cpt_open_object(NULL, ctx); + + for (i = 0; i < n; i++) + stats[i] = fold_field(mib, i); + + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_BITS; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_DATA; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(stats, n * sizeof(*stats), ctx); + ctx->align(ctx); + + cpt_close_object(ctx); + + cpt_release_buf(ctx); +} + +static void cpt_dump_snmp_stub(struct cpt_context *ctx) +{ + struct cpt_object_hdr o; + + cpt_open_object(NULL, ctx); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_BITS; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_VOID; + ctx->write(&o, sizeof(o), ctx); + ctx->align(ctx); + cpt_close_object(ctx); +} + +static int cpt_dump_snmp(struct cpt_context *ctx) +{ + struct ve_struct *ve; + struct net *net; + + ve = get_exec_env(); + net = ve->ve_netns; + + cpt_open_section(ctx, CPT_SECT_SNMP_STATS); + + cpt_dump_snmp_stat(ctx, (void **)&net->mib.net_statistics, + LINUX_MIB_MAX); + cpt_dump_snmp_stat(ctx, (void **)&net->mib.ip_statistics, + IPSTATS_MIB_MAX); + cpt_dump_snmp_stat(ctx, (void **)&net->mib.tcp_statistics, + TCP_MIB_MAX); + cpt_dump_snmp_stat(ctx, (void **)&net->mib.udp_statistics, + UDP_MIB_MAX); + cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmp_statistics, + ICMP_MIB_MAX); + cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmpmsg_statistics, + ICMPMSG_MIB_MAX); +#if defined(CONFIG_IPV6) || 
/*
 * Dump the whole network configuration: links and interface addresses
 * (both under rtnl_lock), then routes, iptables rules and SNMP statistics.
 *
 * Stops at the first step that fails and returns its error; returns 0 when
 * every step succeeded.
 */
int cpt_dump_ifinfo(struct cpt_context * ctx)
{
	int rc;

	rtnl_lock();
	rc = cpt_dump_link(ctx);
	if (rc == 0)
		rc = cpt_dump_ifaddr(ctx);
	rtnl_unlock();
	if (rc)
		return rc;

	/* The remaining steps run without the rtnl lock. */
	rc = cpt_dump_route(ctx);
	if (rc)
		return rc;

	rc = cpt_dump_iptables(ctx);
	if (rc)
		return rc;

	return cpt_dump_snmp(ctx);
}
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = kmalloc(sizeof(cpt_object_t), gfp); + if (obj) { + INIT_LIST_HEAD(&obj->o_list); + INIT_LIST_HEAD(&obj->o_hash); + INIT_LIST_HEAD(&obj->o_alist); + obj->o_count = 1; + obj->o_pos = CPT_NULL; + obj->o_lock = 0; + obj->o_parent = NULL; + obj->o_index = CPT_NOINDEX; + obj->o_obj = NULL; + obj->o_image = NULL; + obj->o_flags = 0; + ctx->objcount++; + } + return obj; +} + +void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx) +{ + list_del(&obj->o_alist); + kfree(obj); + ctx->objcount--; +} + +void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx) +{ + list_add_tail(&obj->o_list, &ctx->object_array[type]); +} + +void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, + cpt_object_t *head, cpt_context_t *ctx) +{ + list_add(&obj->o_list, &head->o_list); +} + +cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p, + unsigned gfp_mask, cpt_context_t *ctx) +{ + cpt_object_t *obj; + + obj = lookup_cpt_object(type, p, ctx); + + if (obj) { + obj->o_count++; + return obj; + } + + if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) { + if (p) + cpt_obj_setobj(obj, p, ctx); + intern_cpt_object(type, obj, ctx); + return obj; + } + return NULL; +} + +cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx) +{ + return __cpt_object_add(type, p, GFP_KERNEL, ctx); +} + +cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx) +{ + cpt_object_t *obj; + + obj = lookup_cpt_object(type, p, ctx); + + if (obj) + obj->o_count++; + + return obj; +} + +int cpt_object_init(cpt_context_t *ctx) +{ + int i; + + for (i=0; iobject_array[i]); + } + return 0; +} + +int cpt_object_destroy(cpt_context_t *ctx) +{ + int i; + + for 
(i=0; iobject_array[i])) { + struct list_head *head = ctx->object_array[i].next; + cpt_object_t *obj = list_entry(head, cpt_object_t, o_list); + list_del(head); + if (obj->o_image) + kfree(obj->o_image); + free_cpt_object(obj, ctx); + } + } + if (ctx->objcount != 0) + eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount); + return 0; +} + +cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, type) { + if (obj->o_obj == p) + return obj; + } + return NULL; +} + +cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, type) { + if (obj->o_pos == pos) + return obj; + } + return NULL; +} + +cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, type) { + if (obj->o_index == index) + return obj; + } + return NULL; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_obj.h linux-2.6.32.48-openvz/kernel/cpt/cpt_obj.h --- linux-2.6.32.48/kernel/cpt/cpt_obj.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_obj.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,64 @@ +#ifndef __CPT_OBJ_H_ +#define __CPT_OBJ_H_ 1 + +#include +#include + +typedef struct _cpt_object +{ + struct list_head o_list; + struct list_head o_hash; + int o_count; + int o_index; + int o_lock; + loff_t o_pos; + loff_t o_ppos; + void *o_obj; + void *o_image; + void *o_parent; + struct list_head o_alist; + unsigned int o_flags; +#define CPT_INODE_HARDLINKED 0x1 +} cpt_object_t; + +struct cpt_context; + +#define for_each_object(obj, type) list_for_each_entry(obj, &ctx->object_array[type], o_list) + + +extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx); +extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx); + +cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, 
struct cpt_context *ctx); +cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx); +cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx); + +static inline void cpt_obj_setpos(cpt_object_t *cpt, loff_t pos, struct cpt_context *ctx) +{ + cpt->o_pos = pos; + /* Add to pos hash table */ +} + +static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx) +{ + cpt->o_obj = ptr; + /* Add to hash table */ +} + +static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx) +{ + cpt->o_index = index; + /* Add to index hash table */ +} + + +extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx); +extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx); +extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx); +extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx); +extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx); + +extern int cpt_object_init(struct cpt_context *ctx); +extern int cpt_object_destroy(struct cpt_context *ctx); + +#endif /* __CPT_OBJ_H_ */ diff -urNp linux-2.6.32.48/kernel/cpt/cpt_proc.c linux-2.6.32.48-openvz/kernel/cpt/cpt_proc.c --- linux-2.6.32.48/kernel/cpt/cpt_proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_proc.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,623 @@ +/* + * + * kernel/cpt/cpt_proc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" + +MODULE_AUTHOR("Alexey Kuznetsov "); +MODULE_LICENSE("GPL"); + +/* List of contexts and lock protecting the list */ +static struct list_head cpt_context_list; +static spinlock_t cpt_context_lock; + +static int proc_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + cpt_context_t *ctx; + + len += sprintf(buffer, "Ctx Id VE State\n"); + + spin_lock(&cpt_context_lock); + + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + len += sprintf(buffer+len,"%p %08x %-8u %d", + ctx, + ctx->contextid, + ctx->ve_id, + ctx->ctx_state + ); + + buffer[len++] = '\n'; + + pos = begin+len; + if (pos < offset) { + len = 0; + begin = pos; + } + if (pos > offset+length) + goto done; + } + *eof = 1; + +done: + spin_unlock(&cpt_context_lock); + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) + len = length; + if(len < 0) + len = 0; + return len; +} + +void cpt_context_release(cpt_context_t *ctx) +{ + int i; + + list_del(&ctx->ctx_list); + spin_unlock(&cpt_context_lock); + + if (ctx->ctx_state > 0) + cpt_resume(ctx); + ctx->ctx_state = CPT_CTX_ERROR; + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) + put_task_struct(ctx->pgin_task); + if (ctx->pgin_dir) + cpt_free_pgin_dir(ctx); + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); +#endif + if (ctx->objcount) + eprintk_ctx("%d objects leaked\n", ctx->objcount); + if (ctx->file) + fput(ctx->file); + cpt_flush_error(ctx); + if (ctx->errorfile) { + fput(ctx->errorfile); + ctx->errorfile = NULL; + } + for (i = 0; i < ctx->linkdirs_num; i++) + fput(ctx->linkdirs[i]); + if 
(ctx->error_msg) { + free_page((unsigned long)ctx->error_msg); + ctx->error_msg = NULL; + } + if (ctx->statusfile) + fput(ctx->statusfile); + if (ctx->lockfile) + fput(ctx->lockfile); + kfree(ctx); + + spin_lock(&cpt_context_lock); +} + +static void __cpt_context_put(cpt_context_t *ctx) +{ + if (!--ctx->refcount) + cpt_context_release(ctx); +} + +static void cpt_context_put(cpt_context_t *ctx) +{ + spin_lock(&cpt_context_lock); + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); +} + +cpt_context_t * cpt_context_open(void) +{ + cpt_context_t *ctx; + + if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { + cpt_context_init(ctx); + spin_lock(&cpt_context_lock); + list_add_tail(&ctx->ctx_list, &cpt_context_list); + spin_unlock(&cpt_context_lock); + ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); + if (ctx->error_msg != NULL) + ctx->error_msg[0] = 0; + } + return ctx; +} + +static cpt_context_t * cpt_context_lookup(unsigned int contextid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->contextid == contextid) { + ctx->refcount++; + spin_unlock(&cpt_context_lock); + return ctx; + } + } + spin_unlock(&cpt_context_lock); + return NULL; +} + +int cpt_context_lookup_veid(unsigned int veid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->ve_id == veid && ctx->ctx_state > 0) { + spin_unlock(&cpt_context_lock); + return 1; + } + } + spin_unlock(&cpt_context_lock); + return 0; +} + +static int cpt_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) +{ + int err = 0; + cpt_context_t *ctx; + struct file *dfile = NULL; + int try; + + unlock_kernel(); + + if (cmd == CPT_VMPREP) { +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = cpt_mm_prepare(arg); +#else + err = -EINVAL; +#endif + goto out_lock; + } + + if (cmd == CPT_TEST_CAPS) { + unsigned int src_flags, dst_flags = arg; + + err 
= 0; + src_flags = test_cpu_caps_and_features(); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); + goto out_lock; + } + + if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { + cpt_context_t *old_ctx; + + ctx = NULL; + if (cmd == CPT_JOIN_CONTEXT) { + err = -ENOENT; + ctx = cpt_context_lookup(arg); + if (!ctx) + goto out_lock; + } + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + file->private_data = ctx; + + if (old_ctx) { + if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { + old_ctx->sticky = 0; + old_ctx->refcount--; + } + __cpt_context_put(old_ctx); + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_lock; + } + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + if (ctx) + ctx->refcount++; + spin_unlock(&cpt_context_lock); + + if (!ctx) { + cpt_context_t *old_ctx; + + err = -ENOMEM; + ctx = cpt_context_open(); + if (!ctx) + goto out_lock; + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + if (!old_ctx) { + ctx->refcount++; + file->private_data = ctx; + } else { + old_ctx->refcount++; + } + if (old_ctx) { + __cpt_context_put(ctx); + ctx = old_ctx; + } + spin_unlock(&cpt_context_lock); + } + + if (cmd == CPT_GET_CONTEXT) { + unsigned int contextid = (unsigned int)arg; + + if (ctx->contextid && ctx->contextid != contextid) { + err = -EINVAL; + goto out_nosem; + } + if (!ctx->contextid) { + cpt_context_t *c1 = 
cpt_context_lookup(contextid); + if (c1) { + cpt_context_put(c1); + err = -EEXIST; + goto out_nosem; + } + ctx->contextid = contextid; + } + spin_lock(&cpt_context_lock); + if (!ctx->sticky) { + ctx->sticky = 1; + ctx->refcount++; + } + spin_unlock(&cpt_context_lock); + goto out_nosem; + } + + down(&ctx->main_sem); + + err = -EBUSY; + if (ctx->ctx_state < 0) + goto out; + + err = 0; + switch (cmd) { + case CPT_SET_DUMPFD: + if (ctx->ctx_state == CPT_CTX_DUMPING) { + err = -EBUSY; + break; + } + if (arg >= 0) { + err = -EBADF; + dfile = fget(arg); + if (dfile == NULL) + break; + if (dfile->f_op == NULL || + dfile->f_op->write == NULL) { + fput(dfile); + break; + } + err = 0; + } + if (ctx->file) + fput(ctx->file); + ctx->file = dfile; + break; + case CPT_LINKDIR_ADD: + if (ctx->linkdirs_num >= CPT_MAX_LINKDIRS) { + err = -EMLINK; + break; + } + + dfile = fget(arg); + if (!dfile) { + err = -EBADFD; + break; + } + + if (!S_ISDIR(dfile->f_dentry->d_inode->i_mode)) { + err = -ENOTDIR; + fput(dfile); + break; + } + + ctx->linkdirs[ctx->linkdirs_num++] = dfile; + break; + case CPT_SET_ERRORFD: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->errorfile) + fput(ctx->errorfile); + ctx->errorfile = dfile; + break; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + case CPT_SET_PAGEINFDIN: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + ctx->pagein_file_in = dfile; + break; + case CPT_SET_PAGEINFDOUT: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + ctx->pagein_file_out = dfile; + break; + case CPT_SET_LAZY: + ctx->lazy_vm = arg; + break; + case CPT_ITER: + err = cpt_iteration(ctx); + break; + case CPT_PAGEIND: + err = cpt_start_pagein(ctx); + break; +#endif + case CPT_SET_VEID: + if (ctx->ctx_state > 0) { + err = -EBUSY; + 
break; + } + ctx->ve_id = arg; + break; + case CPT_SET_CPU_FLAGS: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->dst_cpu_flags = arg; + ctx->src_cpu_flags = test_cpu_caps_and_features(); + break; + case CPT_SUSPEND: + if (cpt_context_lookup_veid(ctx->ve_id) || + ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ctx_state = CPT_CTX_SUSPENDING; + try = 0; + do { + err = cpt_vps_suspend(ctx); + if (err) + cpt_resume(ctx); + if (err == -EAGAIN) + msleep(1000); + try++; + } while (err == -EAGAIN && try < 3); + if (err) { + ctx->ctx_state = CPT_CTX_IDLE; + } else { + ctx->ctx_state = CPT_CTX_SUSPENDED; + } + break; + case CPT_DUMP: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + if (!ctx->file) { + err = -EBADF; + break; + } + err = cpt_dump(ctx); + break; + case CPT_RESUME: + if (ctx->ctx_state == CPT_CTX_IDLE) { + err = -ENOENT; + break; + } + err = cpt_resume(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_KILL: + if (ctx->ctx_state == CPT_CTX_IDLE) { + err = -ENOENT; + break; + } + err = cpt_kill(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_TEST_VECAPS: + { + __u32 dst_flags = arg; + __u32 src_flags; + + err = cpt_vps_caps(ctx, &src_flags); + if (err) + break; + + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err); + test_one_flag(src_flags, 
dst_flags, CPT_CPU_X86_SYSCALL, "syscall", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL32, "syscall32", err); + if (dst_flags & (1 << CPT_SLM_DMPRST)) { + eprintk_ctx("SLM is enabled on destination node, but slm_dmprst module is not loaded\n"); + err = 1; + } + + if (src_flags & CPT_UNSUPPORTED_MASK) + err = 2; + break; + } + default: + err = -EINVAL; + break; + } + +out: + cpt_flush_error(ctx); + up(&ctx->main_sem); +out_nosem: + cpt_context_put(ctx); +out_lock: + lock_kernel(); + if (err == -ERESTARTSYS || err == -ERESTARTNOINTR || + err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK) + err = -EINTR; + return err; +} + +static int cpt_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int cpt_release(struct inode * inode, struct file * file) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + file->private_data = NULL; + + if (ctx) + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); + + module_put(THIS_MODULE); + return 0; +} + + +static struct file_operations cpt_fops = { + .owner = THIS_MODULE, + .open = cpt_open, + .release = cpt_release, + .ioctl = cpt_ioctl, +}; + +static struct proc_dir_entry *proc_ent; + +static struct ctl_table_header *ctl_header; + +static ctl_table debug_table[] = { + { + .procname = "cpt", + .data = &debug_level, + .maxlen = sizeof(debug_level), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; +static ctl_table root_table[] = { + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { .ctl_name = 0 } +}; + +static int __init init_cpt(void) +{ + int err; + + err = -ENOMEM; + ctl_header = register_sysctl_table(root_table); + if (!ctl_header) + goto err_mon; + + spin_lock_init(&cpt_context_lock); + INIT_LIST_HEAD(&cpt_context_list); + + err = -EINVAL; + proc_ent = proc_create("cpt", 0600, NULL, NULL); + if 
(!proc_ent) + goto err_out; + + cpt_fops.read = proc_ent->proc_fops->read; + cpt_fops.write = proc_ent->proc_fops->write; + cpt_fops.llseek = proc_ent->proc_fops->llseek; + proc_ent->proc_fops = &cpt_fops; + + proc_ent->read_proc = proc_read; + proc_ent->data = NULL; + return 0; + +err_out: + unregister_sysctl_table(ctl_header); +err_mon: + return err; +} +module_init(init_cpt); + +static void __exit exit_cpt(void) +{ + remove_proc_entry("cpt", NULL); + unregister_sysctl_table(ctl_header); + + spin_lock(&cpt_context_lock); + while (!list_empty(&cpt_context_list)) { + cpt_context_t *ctx; + ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); + + if (!ctx->sticky) + ctx->refcount++; + ctx->sticky = 0; + + BUG_ON(ctx->refcount != 1); + + __cpt_context_put(ctx); + } + spin_unlock(&cpt_context_lock); +} +module_exit(exit_cpt); diff -urNp linux-2.6.32.48/kernel/cpt/cpt_process.c linux-2.6.32.48-openvz/kernel/cpt/cpt_process.c --- linux-2.6.32.48/kernel/cpt/cpt_process.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_process.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,1380 @@ +/* + * + * kernel/cpt/cpt_process.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_ubc.h" +#include "cpt_process.h" +#include "cpt_kernel.h" + +#ifdef CONFIG_X86_32 +#undef task_pt_regs +#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1) +#endif + +int check_task_state(struct task_struct *tsk, struct cpt_context *ctx) +{ +#ifdef CONFIG_X86_64 + if (!(task_thread_info(tsk)->flags&_TIF_IA32)) { + if (task_pt_regs(tsk)->ip >= VSYSCALL_START && + task_pt_regs(tsk)->ip < VSYSCALL_END) { + eprintk_ctx(CPT_FID "cannot be checkpointied while vsyscall, try later\n", CPT_TID(tsk)); + return -EAGAIN; + } + } +#endif + return 0; +} + +#ifdef CONFIG_X86 + +static u32 encode_segment(u32 segreg) +{ + segreg &= 0xFFFF; + + if (segreg == 0) + return CPT_SEG_ZERO; + if ((segreg & 3) != 3) { + wprintk("Invalid RPL of a segment reg %x\n", segreg); + return CPT_SEG_ZERO; + } + + /* LDT descriptor, it is just an index to LDT array */ + if (segreg & 4) + return CPT_SEG_LDT + (segreg >> 3); + + /* TLS descriptor. 
*/ + if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN && + (segreg >> 3) <= GDT_ENTRY_TLS_MAX) + return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN); + + /* One of standard desriptors */ +#ifdef CONFIG_X86_64 + if (segreg == __USER32_DS) + return CPT_SEG_USER32_DS; + if (segreg == __USER32_CS) + return CPT_SEG_USER32_CS; + if (segreg == __USER_DS) + return CPT_SEG_USER64_DS; + if (segreg == __USER_CS) + return CPT_SEG_USER64_CS; +#else + if (segreg == __USER_DS) + return CPT_SEG_USER32_DS; + if (segreg == __USER_CS) + return CPT_SEG_USER32_CS; +#endif + wprintk("Invalid segment reg %x\n", segreg); + return CPT_SEG_ZERO; +} + +#ifdef CONFIG_X86_64 +static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s, + struct task_struct *tsk) +{ + d->cpt_ebp = s->bp; + d->cpt_ebx = s->bx; + d->cpt_eax = s->ax; + d->cpt_ecx = s->cx; + d->cpt_edx = s->dx; + d->cpt_esi = s->si; + d->cpt_edi = s->di; + d->cpt_orig_eax = s->orig_ax; + d->cpt_eip = s->ip; + d->cpt_xcs = encode_segment(s->cs); + d->cpt_eflags = s->flags; + d->cpt_esp = s->sp; + d->cpt_xss = encode_segment(s->ss); + d->cpt_xds = encode_segment(tsk->thread.ds); + d->cpt_xes = encode_segment(tsk->thread.es); +} + +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + cpt_open_object(NULL, ctx); + + if (task_thread_info(tsk)->flags & _TIF_IA32) { + struct cpt_x86_regs ri; + ri.cpt_next = sizeof(ri); + ri.cpt_object = CPT_OBJ_X86_REGS; + ri.cpt_hdrlen = sizeof(ri); + ri.cpt_content = CPT_CONTENT_VOID; + + ri.cpt_debugreg[0] = tsk->thread.debugreg0; + ri.cpt_debugreg[1] = tsk->thread.debugreg1; + ri.cpt_debugreg[2] = tsk->thread.debugreg2; + ri.cpt_debugreg[3] = tsk->thread.debugreg3; + ri.cpt_debugreg[4] = 0; + ri.cpt_debugreg[5] = 0; + ri.cpt_debugreg[6] = tsk->thread.debugreg6; + ri.cpt_debugreg[7] = tsk->thread.debugreg7; + ri.cpt_fs = encode_segment(tsk->thread.fsindex); + ri.cpt_gs = CPT_SEG_ZERO; + ri.cpt_ugs = encode_segment(tsk->thread.gsindex); + + 
xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk); + + ctx->write(&ri, sizeof(ri), ctx); + } else { + struct cpt_x86_64_regs ri; + ri.cpt_next = sizeof(ri); + ri.cpt_object = CPT_OBJ_X86_64_REGS; + ri.cpt_hdrlen = sizeof(ri); + ri.cpt_content = CPT_CONTENT_VOID; + + ri.cpt_fsbase = tsk->thread.fs; + ri.cpt_gsbase = tsk->thread.gs; + ri.cpt_fsindex = encode_segment(tsk->thread.fsindex); + ri.cpt_gsindex = encode_segment(tsk->thread.gsindex); + ri.cpt_ds = encode_segment(tsk->thread.ds); + ri.cpt_es = encode_segment(tsk->thread.es); + ri.cpt_debugreg[0] = tsk->thread.debugreg0; + ri.cpt_debugreg[1] = tsk->thread.debugreg1; + ri.cpt_debugreg[2] = tsk->thread.debugreg2; + ri.cpt_debugreg[3] = tsk->thread.debugreg3; + ri.cpt_debugreg[4] = 0; + ri.cpt_debugreg[5] = 0; + ri.cpt_debugreg[6] = tsk->thread.debugreg6; + ri.cpt_debugreg[7] = tsk->thread.debugreg7; + + memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs)); + + ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs); + ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss); + + ctx->write(&ri, sizeof(ri), ctx); + + } + cpt_close_object(ctx); + + return 0; +} + +#else + +/* + * x86 build without CONFIG_X86_64: emit one CPT_OBJ_X86_REGS record for tsk + * holding its debug registers, its segment selectors (passed through + * encode_segment()) and the user registers saved in task_pt_regs(tsk). + * Always returns 0. + */ +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_x86_regs ri; + struct pt_regs *pt_regs; + + cpt_open_object(NULL, ctx); + + ri.cpt_next = sizeof(ri); + ri.cpt_object = CPT_OBJ_X86_REGS; + ri.cpt_hdrlen = sizeof(ri); + ri.cpt_content = CPT_CONTENT_VOID; + + ri.cpt_debugreg[0] = tsk->thread.debugreg0; + ri.cpt_debugreg[1] = tsk->thread.debugreg1; + ri.cpt_debugreg[2] = tsk->thread.debugreg2; + ri.cpt_debugreg[3] = tsk->thread.debugreg3; + /* ri lives on the stack and is written out whole: zero the two slots that have no saved value, as the x86_64 variant does */ + ri.cpt_debugreg[4] = 0; + ri.cpt_debugreg[5] = 0; + ri.cpt_debugreg[6] = tsk->thread.debugreg6; + ri.cpt_debugreg[7] = tsk->thread.debugreg7; + + pt_regs = task_pt_regs(tsk); + + ri.cpt_fs = encode_segment(pt_regs->fs); + ri.cpt_gs = encode_segment(tsk->thread.gs); + ri.cpt_ugs = encode_segment(task_user_gs(tsk)); + + ri.cpt_ebx = pt_regs->bx; + ri.cpt_ecx = pt_regs->cx; + ri.cpt_edx = pt_regs->dx; + ri.cpt_esi = 
pt_regs->si; + ri.cpt_edi = pt_regs->di; + ri.cpt_ebp = pt_regs->bp; + ri.cpt_eax = pt_regs->ax; + ri.cpt_xds = pt_regs->ds; + ri.cpt_xes = pt_regs->es; + ri.cpt_orig_eax = pt_regs->orig_ax; + ri.cpt_eip = pt_regs->ip; + ri.cpt_xcs = pt_regs->cs; + ri.cpt_eflags = pt_regs->flags; + ri.cpt_esp = pt_regs->sp; + ri.cpt_xss = pt_regs->ss; + + ri.cpt_xcs = encode_segment(pt_regs->cs); + ri.cpt_xss = encode_segment(pt_regs->ss); + ri.cpt_xds = encode_segment(pt_regs->ds); + ri.cpt_xes = encode_segment(pt_regs->es); + + ctx->write(&ri, sizeof(ri), ctx); + cpt_close_object(ctx); + + return 0; +} +#endif +#endif + +#ifdef CONFIG_IA64 + +/* + PMD? + */ + +#define _C(x) do { if ((err = (x)) < 0) { printk("atm:" CPT_FID #x " %d\n", \ + CPT_TID(tsk), err); return -EINVAL; } } while (0) + +static int ass_to_mouth(struct cpt_ia64_regs *r, struct task_struct *tsk, + struct cpt_context *ctx) +{ + int err; + struct unw_frame_info info; + struct ia64_fpreg fpval; + int i; + + unw_init_from_blocked_task(&info, tsk); + _C(unw_unwind_to_user(&info)); + + /* NAT_BITS */ + do { + unsigned long scratch_unat; + + scratch_unat = info.sw->caller_unat; + if (info.pri_unat_loc) + scratch_unat = *info.pri_unat_loc; + + r->nat[0] = ia64_get_scratch_nat_bits(task_pt_regs(tsk), scratch_unat); + /* Just to be on safe side. 
*/ + r->nat[0] &= 0xFFFFFFFFUL; + } while (0); + + /* R4-R7 */ + for (i = 4; i <= 7; i++) { + char nat = 0; + _C(unw_access_gr(&info, i, &r->gr[i], &nat, 0)); + r->nat[0] |= (nat != 0) << i; + } + + /* B1-B5 */ + for (i = 1; i <= 5; i++) { + _C(unw_access_br(&info, i, &r->br[i], 0)); + } + + /* AR_EC, AR_LC */ + _C(unw_access_ar(&info, UNW_AR_EC, &r->ar_ec, 0)); + _C(unw_access_ar(&info, UNW_AR_LC, &r->ar_lc, 0)); + + /* F2..F5, F16..F31 */ + for (i = 2; i <= 5; i++) { + _C(unw_get_fr(&info, i, &fpval)); + memcpy(&r->fr[i*2], &fpval, 16); + } + for (i = 16; i <= 31; i++) { + _C(unw_get_fr(&info, i, &fpval)); + memcpy(&r->fr[i*2], &fpval, 16); + } + return 0; +} + +#undef _C + +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + int err; + unsigned long pg; + struct cpt_ia64_regs *r; + struct ia64_psr *psr; + struct switch_stack *sw; + struct pt_regs *pt; + void *krbs = (void *)tsk + IA64_RBS_OFFSET; + unsigned long reg; + + if (tsk->exit_state) + return 0; + + pt = task_pt_regs(tsk); + + sw = (struct switch_stack *) (tsk->thread.ksp + 16); + + if ((pg = __get_free_page(GFP_KERNEL)) == 0) + return -ENOMEM; + + r = (void*)pg; + /* To catch if we forgot some register */ + memset(r, 0xA5, sizeof(*r)); + + r->gr[0] = 0; + r->fr[0] = r->fr[1] = 0; + r->fr[2] = 0x8000000000000000UL; + r->fr[3] = 0xffff; + + r->nat[0] = r->nat[1] = 0; + + err = ass_to_mouth(r, tsk, ctx); + if (err) { + printk("ass_to_mouth error %d\n", err); + goto out; + } + + /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ + memcpy(&r->gr[1], &pt->r1, 8*(2-1)); + memcpy(&r->gr[2], &pt->r2, 8*(4-2)); + memcpy(&r->gr[8], &pt->r8, 8*(12-8)); + memcpy(&r->gr[12], &pt->r12, 8*(14-12)); + memcpy(&r->gr[14], &pt->r14, 8*(15-14)); + memcpy(&r->gr[15], &pt->r15, 8*(16-15)); + memcpy(&r->gr[16], &pt->r16, 8*(32-16)); + + r->br[0] = pt->b0; + r->br[6] = pt->b6; + r->br[7] = pt->b7; + + r->ar_bspstore = pt->ar_bspstore; + r->ar_unat = pt->ar_unat; + r->ar_pfs = pt->ar_pfs; + 
r->ar_ccv = pt->ar_ccv; + r->ar_fpsr = pt->ar_fpsr; + r->ar_csd = pt->ar_csd; + r->ar_ssd = pt->ar_ssd; + r->ar_rsc = pt->ar_rsc; + + r->cr_iip = pt->cr_iip; + r->cr_ipsr = pt->cr_ipsr; + + r->pr = pt->pr; + + r->cfm = pt->cr_ifs; + r->ar_rnat = pt->ar_rnat; + + /* fpregs 6..9,10..11 are in pt_regs */ + memcpy(&r->fr[2*6], &pt->f6, 16*(10-6)); + memcpy(&r->fr[2*10], &pt->f10, 16*(12-10)); + /* fpreg 12..15 are on switch stack */ + memcpy(&r->fr[2*12], &sw->f12, 16*(16-12)); + /* fpregs 32...127 */ + psr = ia64_psr(task_pt_regs(tsk)); + preempt_disable(); + if (ia64_is_local_fpu_owner(tsk) && psr->mfh) { + psr->mfh = 0; + tsk->thread.flags |= IA64_THREAD_FPH_VALID; + ia64_save_fpu(&tsk->thread.fph[0]); + } + preempt_enable(); + memcpy(&r->fr[32*2], tsk->thread.fph, 16*(128-32)); + + if (tsk->thread.flags & IA64_THREAD_DBG_VALID) { + memcpy(r->ibr, tsk->thread.ibr, sizeof(r->ibr)); + /* size each copy by its own destination, matching the memset pair below */ + memcpy(r->dbr, tsk->thread.dbr, sizeof(r->dbr)); + } else { + memset(r->ibr, 0, sizeof(r->ibr)); + memset(r->dbr, 0, sizeof(r->dbr)); + } + + r->loadrs = pt->loadrs; + r->num_regs = ia64_rse_num_regs(krbs, krbs + 8*(pt->loadrs >> 19)); + if ((long)pt->cr_ifs > 0) + r->num_regs += (pt->cr_ifs & 0x7f); + + if (r->num_regs > 96) { + eprintk_ctx(CPT_FID " too much RSE regs %lu\n", + CPT_TID(tsk), r->num_regs); + /* leave through out: so the page backing r is freed */ + err = -EINVAL; + goto out; + } + + for (reg = 0; reg < r->num_regs; reg++) { + unsigned long *ptr = ia64_rse_skip_regs(krbs, reg); + unsigned long *rnatp = ia64_rse_rnat_addr(ptr); + + r->gr[32+reg] = *ptr; + + if ((unsigned long)rnatp >= sw->ar_bspstore) + rnatp = &sw->ar_rnat; + if (*rnatp & (1UL<nat[0] |= (1UL<<(reg+32)); + else + r->nat[1] |= (1UL<<(reg-32)); + } + } + if (r->nat[0] | r->nat[1]) + wprintk_ctx(CPT_FID " nat bits %lx%016lx\n", CPT_TID(tsk), + r->nat[1], r->nat[0]); + + cpt_open_object(NULL, ctx); + r->cpt_next = sizeof(*r); + r->cpt_object = CPT_OBJ_IA64_REGS; + r->cpt_hdrlen = sizeof(*r); + r->cpt_content = CPT_CONTENT_VOID; + ctx->write(r, sizeof(*r), ctx); + 
cpt_close_object(ctx); + err = 0; + +out: + free_page(pg); + return err; +} +#endif + +static int dump_kstack(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_obj_bits hdr; + unsigned long size; + void *start; + + cpt_open_object(NULL, ctx); + +#ifdef CONFIG_X86_64 + size = tsk->thread.sp0 - tsk->thread.sp; + start = (void*)tsk->thread.sp; +#elif defined(CONFIG_X86_32) + size = tsk->thread.sp0 - tsk->thread.sp; + start = (void*)tsk->thread.sp; +#elif defined(CONFIG_IA64) + size = (unsigned long)(task_pt_regs(tsk)+1) - tsk->thread.ksp; + start = (void*)tsk->thread.ksp; +#else +#error Arch is not supported +#endif + + hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); + hdr.cpt_object = CPT_OBJ_BITS; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = CPT_CONTENT_STACK; + hdr.cpt_size = size; + + ctx->write(&hdr, sizeof(hdr), ctx); + ctx->write(start, size, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} + +#ifdef CONFIG_X86 +/* Formats of i387_fxsave_struct are the same for x86_64 + * and i386. Plain luck. 
*/ + +static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_obj_bits hdr; + unsigned long size; + int type; + + if (!tsk->thread.xstate) + return 0; + + cpt_open_object(NULL, ctx); + + type = CPT_CONTENT_X86_FPUSTATE; + size = sizeof(struct i387_fxsave_struct); +#ifndef CONFIG_X86_64 + if (!cpu_has_fxsr) { + size = sizeof(struct i387_fsave_struct); + type = CPT_CONTENT_X86_FPUSTATE_OLD; + } +#endif + + hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); + hdr.cpt_object = CPT_OBJ_BITS; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = type; + hdr.cpt_size = size; + + ctx->write(&hdr, sizeof(hdr), ctx); + ctx->write(tsk->thread.xstate, size, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} +#endif + +#ifdef CONFIG_IA64 + +static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx) +{ + return 0; +} +#endif + +static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info) +{ + si->cpt_signo = info->si_signo; + si->cpt_errno = info->si_errno; + si->cpt_code = info->si_code; + + switch(si->cpt_code & __SI_MASK) { + case __SI_TIMER: + si->cpt_pid = info->si_tid; + si->cpt_uid = info->si_overrun; + si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr); + si->cpt_utime = info->si_sys_private; + break; + case __SI_POLL: + si->cpt_pid = info->si_band; + si->cpt_uid = info->si_fd; + break; + case __SI_FAULT: + si->cpt_sigval = cpt_ptr_export(info->si_addr); +#ifdef __ARCH_SI_TRAPNO + si->cpt_pid = info->si_trapno; +#endif + break; + case __SI_CHLD: + si->cpt_pid = info->si_pid; + si->cpt_uid = info->si_uid; + si->cpt_sigval = info->si_status; + si->cpt_stime = info->si_stime; + si->cpt_utime = info->si_utime; + break; + case __SI_KILL: + case __SI_RT: + case __SI_MESGQ: + default: + si->cpt_pid = info->si_pid; + si->cpt_uid = info->si_uid; + si->cpt_sigval = cpt_ptr_export(info->si_ptr); + break; + } + return 0; +} + +static int dump_sigqueue(struct sigpending *list, struct 
cpt_context *ctx) +{ + struct sigqueue *q; + loff_t saved_obj; + + if (list_empty(&list->list)) + return 0; + + cpt_push_object(&saved_obj, ctx); + list_for_each_entry(q, &list->list, list) { + struct cpt_siginfo_image si; + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_SIGINFO; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + si.cpt_qflags = q->flags; + si.cpt_user = q->user->uid; + + if (encode_siginfo(&si, &q->info)) + return -EINVAL; + + ctx->write(&si, sizeof(si), ctx); + } + cpt_pop_object(&saved_obj, ctx); + return 0; +} + + + +static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct signal_struct *sig = obj->o_obj; + struct cpt_signal_image *v = cpt_get_buf(ctx); + struct task_struct *tsk; + int i; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SIGNAL_STRUCT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_pgrp_type = CPT_PGRP_NORMAL; + v->cpt_pgrp = 0; + +#if 0 /* the code below seems to be unneeded */ + if (sig->__pgrp <= 0) { + eprintk_ctx("bad pgid\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + + read_lock(&tasklist_lock); + tsk = find_task_by_pid_ns(sig->__pgrp, &init_pid_ns); + if (tsk == NULL) + v->cpt_pgrp_type = CPT_PGRP_ORPHAN; + read_unlock(&tasklist_lock); + v->cpt_pgrp = pid_to_vpid(sig->__pgrp); +#endif + + v->cpt_old_pgrp = 0; +/* if (!sig->tty_old_pgrp) { + eprintk_ctx("bad tty_old_pgrp\n"); + cpt_release_buf(ctx); + return -EINVAL; + }*/ + if (sig->tty_old_pgrp) { + v->cpt_old_pgrp_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PID); + if (tsk == NULL) { + v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN; + tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PGID); + } + read_unlock(&tasklist_lock); + if (tsk == NULL) { + eprintk_ctx("tty_old_pgrp does not exist anymore\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_old_pgrp = pid_vnr(sig->tty_old_pgrp); + if 
((int)v->cpt_old_pgrp < 0) { + dprintk_ctx("stray tty_old_pgrp %d\n", pid_nr(sig->tty_old_pgrp)); + v->cpt_old_pgrp = -1; + v->cpt_old_pgrp_type = CPT_PGRP_STRAY; + } + } + + v->cpt_session_type = CPT_PGRP_NORMAL; + v->cpt_session = 0; + +#if 0 /* the code below seems to be unneeded */ + if (sig->__session <= 0) { + eprintk_ctx("bad session\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + read_lock(&tasklist_lock); + tsk = find_task_by_pid_ns(sig->__session, &init_pid_ns); + if (tsk == NULL) + v->cpt_session_type = CPT_PGRP_ORPHAN; + read_unlock(&tasklist_lock); + v->cpt_session = pid_to_vpid(sig->__session); +#endif + + v->cpt_leader = sig->leader; + v->cpt_ctty = CPT_NULL; + if (sig->tty) { + cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx); + if (cobj) + v->cpt_ctty = cobj->o_pos; + else { + eprintk_ctx("controlling tty is not found\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + } + memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8); + + v->cpt_curr_target = 0; + if (sig->curr_target) + v->cpt_curr_target = task_pid_vnr(sig->curr_target); + v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0); + v->cpt_group_exit_code = sig->group_exit_code; + v->cpt_group_exit_task = 0; + if (sig->group_exit_task) + v->cpt_group_exit_task = task_pid_vnr(sig->group_exit_task); + v->cpt_notify_count = sig->notify_count; + v->cpt_group_stop_count = sig->group_stop_count; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8) + v->cpt_utime = sig->utime; + v->cpt_stime = sig->stime; + v->cpt_cutime = sig->cutime; + v->cpt_cstime = sig->cstime; + v->cpt_nvcsw = sig->nvcsw; + v->cpt_nivcsw = sig->nivcsw; + v->cpt_cnvcsw = sig->cnvcsw; + v->cpt_cnivcsw = sig->cnivcsw; + v->cpt_min_flt = sig->min_flt; + v->cpt_maj_flt = sig->maj_flt; + v->cpt_cmin_flt = sig->cmin_flt; + v->cpt_cmaj_flt = sig->cmaj_flt; + + if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) + __asm__("undefined\n"); + + for (i=0; icpt_rlim_cur[i] = sig->rlim[i].rlim_cur; + v->cpt_rlim_max[i] = 
sig->rlim[i].rlim_max; + } else { + v->cpt_rlim_cur[i] = CPT_NULL; + v->cpt_rlim_max[i] = CPT_NULL; + } + } +#endif + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + dump_sigqueue(&sig->shared_pending, ctx); + + cpt_close_object(ctx); + return 0; +} + +int cpt_check_unsupported(struct task_struct *tsk, cpt_context_t *ctx) +{ +#ifdef CONFIG_KEYS + if (tsk->cred->request_key_auth || tsk->cred->thread_keyring) { + eprintk_ctx("keys are used by " CPT_FID "\n", CPT_TID(tsk)); + return -EBUSY; + } +#endif +#ifdef CONFIG_NUMA + if (tsk->mempolicy) { + eprintk_ctx("NUMA mempolicy is used by " CPT_FID "\n", CPT_TID(tsk)); + return -EBUSY; + } +#endif +#ifdef CONFIG_TUX + if (tsk->tux_info) { + eprintk_ctx("TUX is used by " CPT_FID "\n", CPT_TID(tsk)); + return -EBUSY; + } +#endif + return 0; +} + +static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + const struct cred *cred; + int last_thread; + struct cpt_task_image *v = cpt_get_buf(ctx); + cpt_object_t *tobj; + cpt_object_t *tg_obj; + loff_t saved_obj; + int i; + int err; + struct timespec delta; + struct mm_struct * tsk_mm; + struct files_struct * tsk_files; + struct fs_struct * tsk_fs; + struct mnt_namespace * tsk_ns; + + cpt_open_object(obj, ctx); + + v->cpt_signal = CPT_NULL; + tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx); + if (!tg_obj) BUG(); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_TASK; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_state = tsk->state; + if (tsk->state == EXIT_ZOMBIE) { + eprintk_ctx("invalid zombie state on" CPT_FID "\n", CPT_TID(tsk)); + cpt_release_buf(ctx); + return -EINVAL; + } else if (tsk->state == EXIT_DEAD) { + if (tsk->exit_state != EXIT_DEAD && + tsk->exit_state != EXIT_ZOMBIE) { + eprintk_ctx("invalid exit_state %d on" CPT_FID "\n", tsk->exit_state, CPT_TID(tsk)); + cpt_release_buf(ctx); + return -EINVAL; + } + } + if (tsk->exit_state) { + 
v->cpt_state = tsk->exit_state; + if (tsk->state != TASK_DEAD) { + eprintk_ctx("invalid tsk->state %ld/%d on" CPT_FID "\n", + tsk->state, tsk->exit_state, CPT_TID(tsk)); + cpt_release_buf(ctx); + return -EINVAL; + } + } + if (cpt_check_unsupported(tsk, ctx)) { + cpt_release_buf(ctx); + return -EBUSY; + } + + v->cpt_flags = tsk->flags & CPT_TASK_FLAGS_MASK; + v->cpt_ptrace = tsk->ptrace; + v->cpt_prio = tsk->prio; + v->cpt_exit_code = tsk->exit_code; + v->cpt_exit_signal = tsk->exit_signal; + v->cpt_pdeath_signal = tsk->pdeath_signal; + v->cpt_static_prio = tsk->static_prio; + v->cpt_rt_priority = tsk->rt_priority; + v->cpt_policy = tsk->policy; + if (v->cpt_policy != SCHED_NORMAL) { + eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm); + cpt_release_buf(ctx); + return -EINVAL; + } + + /* Unpleasant moment. When leader of thread group exits, + * it remains in zombie state until all the group exits. + * We save not-NULL pointers to process mm/files/fs, so + * that we can restore this thread group. + */ + tsk_mm = tsk->mm; + tsk_files = tsk->files; + tsk_fs = tsk->fs; + tsk_ns = tsk->nsproxy ? 
tsk->nsproxy->mnt_ns : NULL; + + if (tsk->exit_state && !thread_group_empty(tsk) && + thread_group_leader(tsk)) { + struct task_struct * p = tsk; + + read_lock(&tasklist_lock); + do { + if (p->mm) + tsk_mm = p->mm; + if (p->files) + tsk_files = p->files; + if (p->fs) + tsk_fs = p->fs; + if (p->nsproxy && p->nsproxy->mnt_ns) + tsk_ns = p->nsproxy->mnt_ns; + p = next_thread(p); + } while (p != tsk); + read_unlock(&tasklist_lock); + } + + v->cpt_mm = CPT_NULL; + if (tsk_mm) { + tobj = lookup_cpt_object(CPT_OBJ_MM, tsk_mm, ctx); + if (!tobj) BUG(); + v->cpt_mm = tobj->o_pos; + } + v->cpt_files = CPT_NULL; + if (tsk_files) { + tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk_files, ctx); + if (!tobj) BUG(); + v->cpt_files = tobj->o_pos; + } + v->cpt_fs = CPT_NULL; + if (tsk_fs) { + tobj = lookup_cpt_object(CPT_OBJ_FS, tsk_fs, ctx); + if (!tobj) BUG(); + v->cpt_fs = tobj->o_pos; + } + v->cpt_namespace = CPT_NULL; + if (tsk_ns) { + tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk_ns, ctx); + if (!tobj) BUG(); + v->cpt_namespace = tobj->o_pos; + + if (tsk_ns != current->nsproxy->mnt_ns) + eprintk_ctx("namespaces are not supported:" + "process " CPT_FID "\n", CPT_TID(tsk)); + } + v->cpt_sysvsem_undo = CPT_NULL; + if (tsk->sysvsem.undo_list && !tsk->exit_state) { + tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx); + if (!tobj) BUG(); + v->cpt_sysvsem_undo = tobj->o_pos; + } + v->cpt_sighand = CPT_NULL; + if (tsk->sighand) { + tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx); + if (!tobj) BUG(); + v->cpt_sighand = tobj->o_pos; + } + v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked); + v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked); + v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask); + + v->cpt_pid = task_pid_vnr(tsk); + v->cpt_tgid = task_tgid_vnr(tsk); + v->cpt_ppid = 0; + if (tsk->parent) { + if (tsk->parent != tsk->real_parent && + !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) { + 
eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, task_pid_vnr(tsk), tsk->comm); + cpt_release_buf(ctx); + return -EBUSY; + } + v->cpt_ppid = task_pid_vnr(tsk->parent); + } + v->cpt_rppid = tsk->real_parent ? task_pid_vnr(tsk->real_parent) : 0; + v->cpt_pgrp = task_pgrp_vnr(tsk); + v->cpt_session = task_session_vnr(tsk); + v->cpt_old_pgrp = 0; + if (tsk->signal->tty_old_pgrp) + v->cpt_old_pgrp = pid_vnr(tsk->signal->tty_old_pgrp); + v->cpt_leader = tsk->group_leader ? task_pid_vnr(tsk->group_leader) : 0; + v->cpt_set_tid = (unsigned long)tsk->set_child_tid; + v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid; + memcpy(v->cpt_comm, tsk->comm, 16); + + cred = tsk->cred; + v->cpt_user = cred->user->uid; + v->cpt_uid = cred->uid; + v->cpt_euid = cred->euid; + v->cpt_suid = cred->suid; + v->cpt_fsuid = cred->fsuid; + v->cpt_gid = cred->gid; + v->cpt_egid = cred->egid; + v->cpt_sgid = cred->sgid; + v->cpt_fsgid = cred->fsgid; + v->cpt_ngids = 0; + if (cred->group_info && cred->group_info->ngroups != 0) { + int i = cred->group_info->ngroups; + if (i > 32) { + /* Shame... I did a simplified version and _forgot_ + * about this. Later, later. 
*/ + eprintk_ctx("too many of groups " CPT_FID "\n", CPT_TID(tsk)); + /* release the image buffer before bailing out, as the tls-descriptor exit below does */ + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_ngids = i; + for (i--; i>=0; i--) + v->cpt_gids[i] = cred->group_info->small_block[i]; + } + v->cpt_prctl_uac = 0; + v->cpt_prctl_fpemu = 0; + v->__cpt_pad1 = 0; +#ifdef CONFIG_IA64 + v->cpt_prctl_uac = (tsk->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT; + v->cpt_prctl_fpemu = (tsk->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT; +#endif + memcpy(&v->cpt_ecap, &cred->cap_effective, 8); + memcpy(&v->cpt_icap, &cred->cap_inheritable, 8); + memcpy(&v->cpt_pcap, &cred->cap_permitted, 8); + v->cpt_keepcap = cred->securebits; + + v->cpt_did_exec = tsk->did_exec; + v->cpt_exec_domain = -1; + v->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<cpt_64bit = 0; +#ifdef CONFIG_X86_64 + /* Clear x86_64 specific flags */ + v->cpt_thrflags &= ~(_TIF_FORK|_TIF_IA32); + if (!(task_thread_info(tsk)->flags & _TIF_IA32)) { + ctx->tasks64++; + v->cpt_64bit = 1; + } +#endif +#ifdef CONFIG_IA64 + /* Clear ia64 specific flags */ + //// v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32); + if (!IS_IA32_PROCESS(task_pt_regs(tsk))) { + ctx->tasks64++; + v->cpt_64bit = 1; + } +#endif + v->cpt_thrstatus = task_thread_info(tsk)->status; + v->cpt_addr_limit = -1; + + v->cpt_personality = tsk->personality; + +#ifdef CONFIG_X86 + for (i=0; i=3) { + eprintk_ctx("too many tls descs\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a; + } +#endif + + v->cpt_restart.fn = CPT_RBL_0; + if (task_thread_info(tsk)->restart_block.fn != task_thread_info(current)->restart_block.fn) { + struct restart_block *rb = &task_thread_info(tsk)->restart_block; + ktime_t e; + + if (rb->fn == hrtimer_nanosleep_restart) { + v->cpt_restart.fn = CPT_RBL_NANOSLEEP; + + e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2; + e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + 
v->cpt_restart.arg0 = rb->arg0; + v->cpt_restart.arg1 = rb->arg1; + v->cpt_restart.arg2 = ktime_to_ns(e); + v->cpt_restart.arg3 = 0; + dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0); + goto continue_dump; + } +#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) + if (rb->fn == compat_nanosleep_restart) { + v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP; + + e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2; + e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + v->cpt_restart.arg0 = rb->arg0; + v->cpt_restart.arg1 = rb->arg1; + v->cpt_restart.arg2 = ktime_to_ns(e); + v->cpt_restart.arg3 = 0; + dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0); + goto continue_dump; + } +#endif + if (rb->fn == do_restart_poll) { + u64 timeout_jiffies; + + timeout_jiffies = ((u64)rb->arg3 << 32)|(u64)rb->arg2; + e.tv64 = timeout_jiffies * TICK_NSEC; + + v->cpt_restart.fn = CPT_RBL_POLL; + v->cpt_restart.arg0 = rb->arg0; + v->cpt_restart.arg1 = rb->arg1; + v->cpt_restart.arg2 = ktime_to_ns(e); + v->cpt_restart.arg3 = 0; + dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0); + goto continue_dump; + } + if (rb->fn == futex_wait_restart) { + v->cpt_restart.fn = CPT_RBL_FUTEX_WAIT; + + e.tv64 = rb->futex.time; + e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + v->cpt_restart.arg0 = (unsigned long)rb->futex.uaddr; + v->cpt_restart.arg1 = rb->futex.val; + v->cpt_restart.arg2 = ktime_to_ns(e); + v->cpt_restart.arg3 = rb->futex.flags; + goto continue_dump; + } + eprintk_ctx("unknown restart block %p\n", rb->fn); + /* v still holds the buffer taken with cpt_get_buf(); release it before failing */ + cpt_release_buf(ctx); + return -EINVAL; + } + +continue_dump: + v->cpt_it_real_incr = 0; + v->cpt_it_prof_incr = 0; + v->cpt_it_virt_incr = 0; + v->cpt_it_real_value = 0; + v->cpt_it_prof_value = 0; + v->cpt_it_virt_value = 0; + if (thread_group_leader(tsk) && tsk->exit_state == 0) { + ktime_t rem; + + v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr); + 
v->cpt_it_prof_incr = tsk->signal->it[CPUCLOCK_PROF].incr; + v->cpt_it_virt_incr = tsk->signal->it[CPUCLOCK_VIRT].incr; + + rem = hrtimer_get_remaining(&tsk->signal->real_timer); + + if (hrtimer_active(&tsk->signal->real_timer)) { + if (rem.tv64 <= 0) + rem.tv64 = NSEC_PER_USEC; + v->cpt_it_real_value = ktime_to_ns(rem); + dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_it_real_value); + } + v->cpt_it_prof_value = tsk->signal->it[CPUCLOCK_PROF].expires; + v->cpt_it_virt_value = tsk->signal->it[CPUCLOCK_VIRT].expires; + } + v->cpt_used_math = (tsk_used_math(tsk) != 0); + + if (tsk->notifier) { + eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm); + cpt_release_buf(ctx); + return -EINVAL; + } + + v->cpt_utime = tsk->utime; + v->cpt_stime = tsk->stime; + delta = tsk->start_time; + _set_normalized_timespec(&delta, + delta.tv_sec - get_exec_env()->start_timespec.tv_sec, + delta.tv_nsec - get_exec_env()->start_timespec.tv_nsec); + v->cpt_starttime = cpt_timespec_export(&delta); + v->cpt_nvcsw = tsk->nvcsw; + v->cpt_nivcsw = tsk->nivcsw; + v->cpt_min_flt = tsk->min_flt; + v->cpt_maj_flt = tsk->maj_flt; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) + v->cpt_cutime = tsk->cutime; + v->cpt_cstime = tsk->cstime; + v->cpt_cnvcsw = tsk->cnvcsw; + v->cpt_cnivcsw = tsk->cnivcsw; + v->cpt_cmin_flt = tsk->cmin_flt; + v->cpt_cmaj_flt = tsk->cmaj_flt; + + if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) + __asm__("undefined\n"); + + for (i=0; icpt_rlim_cur[i] = tsk->rlim[i].rlim_cur; + v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max; + } else { + v->cpt_rlim_cur[i] = CPT_NULL; + v->cpt_rlim_max[i] = CPT_NULL; + } + } +#else + v->cpt_cutime = tsk->signal->cutime; + v->cpt_cstime = tsk->signal->cstime; + v->cpt_cnvcsw = tsk->signal->cnvcsw; + v->cpt_cnivcsw = tsk->signal->cnivcsw; + v->cpt_cmin_flt = tsk->signal->cmin_flt; + v->cpt_cmaj_flt = tsk->signal->cmaj_flt; + + if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) + 
__asm__("undefined\n"); + + for (i=0; icpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur; + v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max; + } else { + v->cpt_rlim_cur[i] = CPT_NULL; + v->cpt_rlim_max[i] = CPT_NULL; + } + } +#endif + +#ifdef CONFIG_BEANCOUNTERS + if (tsk->mm) + v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx); + else + v->cpt_mm_ub = CPT_NULL; + v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx); + v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx); + v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx); +#endif + + v->cpt_ptrace_message = tsk->ptrace_message; + v->cpt_pn_state = tsk->pn_state; + v->cpt_stopped_state = tsk->stopped_state; + v->cpt_sigsuspend_state = 0; + +#ifdef CONFIG_X86_32 + if (tsk->thread.vm86_info) { + eprintk_ctx("vm86 task is running\n"); + cpt_release_buf(ctx); + return -EBUSY; + } +#endif + + v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal); + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + dump_kstack(tsk, ctx); + cpt_pop_object(&saved_obj, ctx); + + cpt_push_object(&saved_obj, ctx); + err = dump_registers(tsk, ctx); + cpt_pop_object(&saved_obj, ctx); + if (err) + return err; + + if (tsk_used_math(tsk)) { + cpt_push_object(&saved_obj, ctx); + dump_fpustate(tsk, ctx); + cpt_pop_object(&saved_obj, ctx); + } + + if (tsk->last_siginfo) { + struct cpt_siginfo_image si; + cpt_push_object(&saved_obj, ctx); + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_LASTSIGINFO; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + if (encode_siginfo(&si, tsk->last_siginfo)) + return -EINVAL; + + ctx->write(&si, sizeof(si), ctx); + cpt_pop_object(&saved_obj, ctx); + } + + if (tsk->sas_ss_size) { + struct cpt_sigaltstack_image si; + cpt_push_object(&saved_obj, ctx); + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_SIGALTSTACK; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + si.cpt_stack = 
tsk->sas_ss_sp; + si.cpt_stacksize = tsk->sas_ss_size; + + ctx->write(&si, sizeof(si), ctx); + cpt_pop_object(&saved_obj, ctx); + } + + if (tsk->robust_list +#ifdef CONFIG_COMPAT + || tsk->compat_robust_list +#endif + ) { + struct cpt_task_aux_image ai; + cpt_push_object(&saved_obj, ctx); + + ai.cpt_next = sizeof(ai); + ai.cpt_object = CPT_OBJ_TASK_AUX; + ai.cpt_hdrlen = sizeof(ai); + ai.cpt_content = CPT_CONTENT_VOID; + + ai.cpt_robust_list = (unsigned long)tsk->robust_list; +#ifdef CONFIG_X86_64 +#ifdef CONFIG_COMPAT + if (task_thread_info(tsk)->flags & _TIF_IA32) + ai.cpt_robust_list = (unsigned long)tsk->compat_robust_list; +#endif +#endif + ctx->write(&ai, sizeof(ai), ctx); + cpt_pop_object(&saved_obj, ctx); + } + + dump_sigqueue(&tsk->pending, ctx); + + last_thread = 1; + read_lock(&tasklist_lock); + do { + struct task_struct * next = next_thread(tsk); + if (next != tsk && !thread_group_leader(next)) + last_thread = 0; + } while (0); + read_unlock(&tasklist_lock); + + if (last_thread) { + struct task_struct *prev_tsk; + int err; + loff_t pos = ctx->file->f_pos; + + cpt_push_object(&saved_obj, ctx); + err = dump_one_signal_struct(tg_obj, ctx); + cpt_pop_object(&saved_obj, ctx); + if (err) + return err; + + prev_tsk = tsk; + for (;;) { + if (prev_tsk->tgid == tsk->tgid) { + loff_t tg_pos; + + tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal); + ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos); + if (thread_group_leader(prev_tsk)) + break; + } + + if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) { + eprintk_ctx("bug: thread group leader is lost\n"); + return -EINVAL; + } + + obj = list_entry(obj->o_list.prev, cpt_object_t, o_list); + prev_tsk = obj->o_obj; + } + } + + cpt_close_object(ctx); + return 0; +} + +int cpt_dump_tasks(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_TASKS); + + for_each_object(obj, CPT_OBJ_TASK) { + int err; + + if ((err = dump_one_process(obj, ctx)) != 0) + return err; + } + + 
cpt_close_section(ctx); + return 0; +} + +int cpt_collect_signals(cpt_context_t *ctx) +{ + cpt_object_t *obj; + + /* Collect process fd sets */ + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) { + eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, task_pid_vnr(tsk), tsk->comm); + return -EBUSY; + } + if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL) + return -ENOMEM; + if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL) + return -ENOMEM; + } + return 0; +} + + +static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct sighand_struct *sig = obj->o_obj; + struct cpt_sighand_image *v = cpt_get_buf(ctx); + int i; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SIGHAND_STRUCT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + for (i=0; i< _NSIG; i++) { + if (sig->action[i].sa.sa_handler != SIG_DFL || + sig->action[i].sa.sa_flags) { + loff_t saved_obj; + struct cpt_sighandler_image *o = cpt_get_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + o->cpt_next = CPT_NULL; + o->cpt_object = CPT_OBJ_SIGHANDLER; + o->cpt_hdrlen = sizeof(*o); + o->cpt_content = CPT_CONTENT_VOID; + + o->cpt_signo = i; + o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler; + o->cpt_restorer = 0; +#ifdef CONFIG_X86 + o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer; +#endif + o->cpt_flags = sig->action[i].sa.sa_flags; + memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8); + ctx->write(o, sizeof(*o), ctx); + cpt_release_buf(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + } + + cpt_close_object(ctx); + return 0; +} + +int cpt_dump_sighand(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + 
cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT); + + for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) { + int err; + + if ((err = dump_one_sighand_struct(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_process.h linux-2.6.32.48-openvz/kernel/cpt/cpt_process.h --- linux-2.6.32.48/kernel/cpt/cpt_process.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_process.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,13 @@ +int cpt_collect_signals(cpt_context_t *); +int cpt_dump_signal(struct cpt_context *); +int cpt_dump_sighand(struct cpt_context *); +int cpt_dump_tasks(struct cpt_context *); + +int rst_signal_complete(struct cpt_task_image *ti, int *exiting, struct cpt_context *ctx); +__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx); + +int rst_restore_process(struct cpt_context *ctx); +int rst_process_linkage(struct cpt_context *ctx); + +int check_task_state(struct task_struct *tsk, struct cpt_context *ctx); +struct pid *alloc_vpid_safe(pid_t vnr); diff -urNp linux-2.6.32.48/kernel/cpt/cpt_socket.c linux-2.6.32.48-openvz/kernel/cpt/cpt_socket.c --- linux-2.6.32.48/kernel/cpt/cpt_socket.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_socket.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,802 @@ +/* + * + * kernel/cpt/cpt_socket.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_files.h" +#include "cpt_kernel.h" + +static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx); + + +/* Sockets are quite different of another kinds of files. 
+ * There is one simplification: only one struct file can refer to a socket, + * so we could store information about socket directly in section FILES as + * a description of a file and append e.g. an array of not-yet-accepted + * connections of listening socket as array of auxiliary data. + * + * Complications are: + * 1. TCP sockets can be orphans. We have to relocate orphans as well, + * so we have to create special section for orphans. + * 2. AF_UNIX sockets are distinguished objects: set of links between + * AF_UNIX sockets is quite arbitrary. + * A. Each socket can refer to many files due to FD passing. + * B. Each socket except for connected ones can have in queue skbs + * sent by any of sockets. + * + * 2A is relatively easy: after our tasks are frozen we make an additional + * recursive pass through the set of collected files and get references to + * FD-passed files. After end of recursion, all the files are treated + * in the same way. They will all be stored in section FILES. + * + * 2B. We have to resolve all those references at some point. + * It is the place where pipe-like approach to image fails. + * + * All this makes socket checkpointing quite cumbersome. + * Right now we collect all the sockets and assign some numeric index value + * to each of them. The socket section is separate and put after section FILES, + * so section FILES refers to sockets by index, section SOCKET refers to FILES + * as usual by position in image. All the refs inside socket section are + * by index. When restoring we read socket section, create objects to hold + * mappings index <-> pos. At the second pass we open sockets (simultaneously + * with their pairs) and create FILE objects. + */ + + +/* ====== FD passing ====== */ + +/* Almost nobody does FD passing via AF_UNIX sockets, nevertheless we + * have to implement this.
A problem is that in general case we receive + * skbs from an unknown context, so new files can arrive to checkpointed + * set of processes even after they are stopped. Well, we are going just + * to ignore unknown fds while doing real checkpointing. It is fair because + * links outside checkpointed set are going to fail anyway. + * + * ATTN: the procedure is recursive. We linearize the recursion adding + * newly found files to the end of file list, so they will be analyzed + * in the same loop. + */ + +static int collect_one_passedfd(struct file *file, cpt_context_t * ctx) +{ + struct inode *inode = file->f_dentry->d_inode; + struct socket *sock; + struct sock *sk; + struct sk_buff *skb; + + if (!S_ISSOCK(inode->i_mode)) + return -ENOTSOCK; + + sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket; + + if (sock->ops->family != AF_UNIX) + return 0; + + sk = sock->sk; + + /* Subtle locking issue. skbs cannot be removed while + * we are scanning, because all the processes are stopped. + * They still can be added to tail of queue. Locking while + * we dereference skb->next is enough to resolve this. + * See above about collision with skbs added after we started + * checkpointing. 
+ */ + + skb = skb_peek(&sk->sk_receive_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { + if (UNIXCB(skb).fp && skb->sk && + (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) { + struct scm_fp_list *fpl = UNIXCB(skb).fp; + int i; + + for (i = fpl->count-1; i >= 0; i--) { + if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL) + return -ENOMEM; + } + } + + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_receive_queue.lock); + } + + return 0; +} + +int cpt_collect_passedfds(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { + int err; + + if ((err = collect_one_passedfd(file, ctx)) < 0) + return err; + } + } + + return 0; +} + +/* ====== End of FD passing ====== */ + +/* Must be called under bh_lock_sock() */ + +void clear_backlog(struct sock *sk) +{ + struct sk_buff *skb = sk->sk_backlog.head; + + sk->sk_backlog.head = sk->sk_backlog.tail = NULL; + while (skb) { + struct sk_buff *next = skb->next; + + skb->next = NULL; + kfree_skb(skb); + skb = next; + } +} + +void release_sock_nobacklog(struct sock *sk) +{ + spin_lock_bh(&(sk->sk_lock.slock)); + clear_backlog(sk); + sk->sk_lock.owned = 0; + if (waitqueue_active(&(sk->sk_lock.wq))) + wake_up(&(sk->sk_lock.wq)); + spin_unlock_bh(&(sk->sk_lock.slock)); +} + +int cpt_dump_skb(int type, int owner, struct sk_buff *skb, + struct sock *sk, struct cpt_context *ctx) +{ + struct cpt_skb_image *v = cpt_get_buf(ctx); + loff_t saved_obj; + struct timeval tmptv; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SKB; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_owner = owner; + v->cpt_queue = type; + skb_get_timestamp(skb, &tmptv); + v->cpt_stamp = cpt_timeval_export(&tmptv); + v->cpt_hspace = skb->data - skb->head; + 
v->cpt_tspace = skb->end - skb->tail; + v->cpt_h = skb_transport_header(skb) - skb->head; + v->cpt_nh = skb_network_header(skb) - skb->head; + v->cpt_mac = skb_mac_header(skb) - skb->head; + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v->cpt_cb)); + memset(v->cpt_cb, 0, sizeof(v->cpt_cb)); +#if !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) + if (sk->sk_protocol == IPPROTO_TCP) { + /* Save control block according to tcp_skb_cb with IPv6 */ + BUG_ON(sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm) > + sizeof(v->cpt_cb) - sizeof(struct inet6_skb_parm)); + memcpy(v->cpt_cb, skb->cb, sizeof(struct inet_skb_parm)); + memcpy((void *)v->cpt_cb + sizeof(struct inet6_skb_parm), + skb->cb + sizeof(struct inet_skb_parm), + sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm)); + } else +#endif + memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb)); + if (sizeof(skb->cb) > sizeof(v->cpt_cb)) { + int i; + for (i=sizeof(v->cpt_cb); icb); i++) { + if (skb->cb[i]) { + wprintk_ctx("dirty skb cb"); + break; + } + } + } + v->cpt_len = skb->len; + v->cpt_mac_len = skb->mac_len; + v->cpt_csum = skb->csum; + v->cpt_local_df = skb->local_df; + v->cpt_pkt_type = skb->pkt_type; + v->cpt_ip_summed = skb->ip_summed; + v->cpt_priority = skb->priority; + v->cpt_protocol = skb->protocol; + v->cpt_security = 0; + v->cpt_gso_segs = skb_shinfo(skb)->gso_segs; + v->cpt_gso_size = skb_shinfo(skb)->gso_size; + if (skb_shinfo(skb)->gso_type) { + eprintk_ctx("skb ufo is not supported\n"); + return -EINVAL; + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (skb->len + (skb->data - skb->head) > 0) { + struct cpt_obj_bits ob; + loff_t saved_obj2; + + cpt_push_object(&saved_obj2, ctx); + cpt_open_object(NULL, ctx); + ob.cpt_next = CPT_NULL; + ob.cpt_object = CPT_OBJ_BITS; + ob.cpt_hdrlen = sizeof(ob); + ob.cpt_content = CPT_CONTENT_DATA; + ob.cpt_size = skb->len + v->cpt_hspace; + + ctx->write(&ob, sizeof(ob), ctx); + + ctx->write(skb->head, (skb->data-skb->head) + 
(skb->len-skb->data_len), ctx); + if (skb->data_len) { + int offset = skb->len - skb->data_len; + while (offset < skb->len) { + int copy = skb->len - offset; + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy)) + BUG(); + ctx->write(ctx->tmpbuf, copy, ctx); + __cpt_release_buf(ctx); + offset += copy; + } + } + + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj2, ctx); + } + + if (skb->sk && skb->sk->sk_family == AF_UNIX) { + struct scm_fp_list *fpl = UNIXCB(skb).fp; + + if (fpl) { + int i; + + for (i = 0; i < fpl->count; i++) { + struct cpt_fd_image v; + cpt_object_t *obj; + loff_t saved_obj2; + + obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx); + + if (!obj) { + eprintk_ctx("lost passed FD\n"); + return -EINVAL; + } + + cpt_push_object(&saved_obj2, ctx); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_FILEDESC; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_fd = i; + v.cpt_file = obj->o_pos; + v.cpt_flags = 0; + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj2, ctx); + } + } + } + + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + return 0; +} + +static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx) +{ + struct sk_buff *skb; + struct sock *sk_cache = NULL; + + skb = skb_peek(&sk->sk_receive_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { + int err; + + if (sk->sk_family == AF_UNIX) { + cpt_object_t *obj; + if (skb->sk != sk_cache) { + idx = -1; + sk_cache = NULL; + obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx); + if (obj) { + idx = obj->o_index; + sk_cache = skb->sk; + } else if (unix_peer(sk) != skb->sk) + goto next_skb; + } + } + + err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, sk, ctx); + if (err) + return err; + +next_skb: + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb->next; + 
spin_unlock_irq(&sk->sk_receive_queue.lock); + } + return 0; +} + +static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx) +{ + struct sk_buff *skb; + + skb = skb_peek(&sk->sk_write_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) { + int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, sk, ctx); + if (err) + return err; + + spin_lock_irq(&sk->sk_write_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_write_queue.lock); + } + return 0; +} + +void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx) +{ + loff_t saved_obj; + if (sk->sk_filter) { + struct cpt_obj_bits v; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_SKFILTER; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_DATA; + v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter); + + ctx->write(&v, sizeof(v), ctx); + ctx->write(sk->sk_filter->insns, v.cpt_size, ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { + cpt_push_object(&saved_obj, ctx); + cpt_dump_mcfilter(sk, ctx); + cpt_pop_object(&saved_obj, ctx); + } +} + +/* Dump socket content */ + +int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx) +{ + struct cpt_sock_image *v = cpt_get_buf(ctx); + struct socket *sock; + struct timeval tmptv; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SOCKET; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_file = CPT_NULL; + sock = sk->sk_socket; + if (sock && sock->file) { + cpt_object_t *tobj; + tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, ctx); + if (tobj) + v->cpt_file = tobj->o_pos; + } + v->cpt_index = index; + v->cpt_parent = parent; + + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { + if (sock && !obj->o_lock) { + lockdep_off(); + lock_sock(sk); + lockdep_on(); + 
obj->o_lock = 1; + } + } + + /* Some bits stored in inode */ + v->cpt_ssflags = sock ? sock->flags : 0; + v->cpt_sstate = sock ? sock->state : 0; + v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0; + + /* Common data */ + v->cpt_family = sk->sk_family; + v->cpt_type = sk->sk_type; + v->cpt_state = sk->sk_state; + v->cpt_reuse = sk->sk_reuse; + v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED); + v->cpt_shutdown = sk->sk_shutdown; + v->cpt_userlocks = sk->sk_userlocks; + v->cpt_no_check = sk->sk_no_check; + v->cpt_zapped = sock_flag(sk, SOCK_DBG); + v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP); + v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE); + v->cpt_protocol = sk->sk_protocol; + v->cpt_err = sk->sk_err; + v->cpt_err_soft = sk->sk_err_soft; + v->cpt_max_ack_backlog = sk->sk_max_ack_backlog; + v->cpt_priority = sk->sk_priority; + v->cpt_rcvlowat = sk->sk_rcvlowat; + v->cpt_rcvtimeo = CPT_NULL; + if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT) + v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo; + v->cpt_sndtimeo = CPT_NULL; + if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT) + v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo; + v->cpt_rcvbuf = sk->sk_rcvbuf; + v->cpt_sndbuf = sk->sk_sndbuf; + v->cpt_bound_dev_if = sk->sk_bound_dev_if; + v->cpt_flags = sk->sk_flags; + v->cpt_lingertime = CPT_NULL; + if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT) + v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? 
INT_MAX : sk->sk_lingertime; + v->cpt_peer_pid = sk->sk_peercred.pid; + v->cpt_peer_uid = sk->sk_peercred.uid; + v->cpt_peer_gid = sk->sk_peercred.gid; + tmptv = ktime_to_timeval(sk->sk_stamp); + v->cpt_stamp = cpt_timeval_export(&tmptv); + + v->cpt_peer = -1; + v->cpt_socketpair = 0; + v->cpt_deleted = 0; + + v->cpt_laddrlen = 0; + if (sock) { + int alen = sizeof(v->cpt_laddr); + int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0); + if (err) { + cpt_release_buf(ctx); + return err; + } + v->cpt_laddrlen = alen; + } + v->cpt_raddrlen = 0; + if (sock) { + int alen = sizeof(v->cpt_raddr); + int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2); + if (!err) + v->cpt_raddrlen = alen; + } + + if (sk->sk_family == AF_UNIX) { + if (unix_sk(sk)->dentry) { + struct dentry *d = unix_sk(sk)->dentry; + v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d); + if (!v->cpt_deleted) { + int err = 0; + char *path; + struct path p; + unsigned long pg = __get_free_page(GFP_KERNEL); + + if (!pg) { + cpt_release_buf(ctx); + return -ENOMEM; + } + + p.dentry = d; + p.mnt = unix_sk(sk)->mnt; + path = d_path(&p, (char *)pg, PAGE_SIZE); + + if (!IS_ERR(path)) { + int len = strlen(path); + if (len < 126) { + strcpy(((char*)v->cpt_laddr)+2, path); + v->cpt_laddrlen = len + 2; + } else { + wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2); + } + err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, 1, ctx); + } else { + eprintk_ctx("cannot get path of an af_unix socket\n"); + err = PTR_ERR(path); + } + free_page(pg); + if (err) { + cpt_release_buf(ctx); + return err; + } + } + } + + /* If the socket is connected, find its peer. If peer is not + * in our table, the socket is connected to external process + * and we consider it disconnected. 
+ */ + if (unix_peer(sk)) { + cpt_object_t *pobj; + pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx); + if (pobj) + v->cpt_peer = pobj->o_index; + else + v->cpt_shutdown = SHUTDOWN_MASK; + + if (unix_peer(unix_peer(sk)) == sk) + v->cpt_socketpair = 1; + } + + /* If the socket shares address with another socket it is + * child of some listening socket. Find and record it. */ + if (unix_sk(sk)->addr && + atomic_read(&unix_sk(sk)->addr->refcnt) > 1 && + sk->sk_state != TCP_LISTEN) { + cpt_object_t *pobj; + for_each_object(pobj, CPT_OBJ_SOCKET) { + struct sock *psk = pobj->o_obj; + if (psk->sk_family == AF_UNIX && + psk->sk_state == TCP_LISTEN && + unix_sk(psk)->addr == unix_sk(sk)->addr) { + v->cpt_parent = pobj->o_index; + break; + } + } + } + } + + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) + cpt_dump_socket_in(v, sk, ctx); + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_dump_sock_attr(sk, ctx); + + dump_rqueue(index, sk, ctx); + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { + dump_wqueue(index, sk, ctx); + cpt_dump_ofo_queue(index, sk, ctx); + } + + if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) + && sk->sk_state == TCP_LISTEN) + cpt_dump_synwait_queue(sk, index, ctx); + + cpt_close_object(ctx); + + if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) + && sk->sk_state == TCP_LISTEN) + cpt_dump_accept_queue(sk, index, ctx); + + return 0; +} + +int cpt_dump_orphaned_sockets(struct cpt_context *ctx) +{ + int i; + + cpt_open_section(ctx, CPT_SECT_ORPHANS); + + for (i = 0; i < tcp_hashinfo.ehash_size; i++) { + struct sock *sk; + struct hlist_nulls_node *node; + spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i); +retry: + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) { + + if (sk->owner_env != get_exec_env()) + continue; + if (sk->sk_socket) + continue; + if (!sock_flag(sk, SOCK_DEAD)) + continue; + if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx)) + 
continue; + sock_hold(sk); + spin_unlock_bh(lock); + + local_bh_disable(); + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + eprintk_ctx("BUG: sk locked by whom?\n"); + sk->sk_lock.owned = 1; + bh_unlock_sock(sk); + local_bh_enable(); + + cpt_dump_socket(NULL, sk, -1, -1, ctx); + + local_bh_disable(); + bh_lock_sock(sk); + sk->sk_lock.owned = 0; + clear_backlog(sk); + tcp_done(sk); + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); + + goto retry; + } + spin_unlock_bh(lock); + } + cpt_close_section(ctx); + return 0; +} + +static int can_dump(struct sock *sk, cpt_context_t *ctx) +{ + switch (sk->sk_family) { + case AF_NETLINK: + if (((struct netlink_sock *)sk)->cb) { + eprintk_ctx("netlink socket has active callback\n"); + return 0; + } + break; + } + return 1; +} + +/* We are not going to block suspend when we have external AF_UNIX connections. + * But we cannot stop feed of new packets/connections to our environment + * from outside. Taking into account that it is intrincically unreliable, + * we collect some amount of data, but when checkpointing/restoring we + * are going to drop everything, which does not make sense: skbs sent + * by outside processes, connections from outside etc. etc. + */ + +/* The first pass. When we see socket referenced by a file, we just + * add it to socket table */ +int cpt_collect_socket(struct file *file, cpt_context_t * ctx) +{ + cpt_object_t *obj; + struct socket *sock; + struct sock *sk; + + if (!S_ISSOCK(file->f_dentry->d_inode->i_mode)) + return -ENOTSOCK; + sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket; + sk = sock->sk; + if (!can_dump(sk, ctx)) + return -EAGAIN; + if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL) + return -ENOMEM; + obj->o_parent = file; + + return 0; +} + +/* + * We should end with table containing: + * * all sockets opened by our processes in the table. 
+ * * all the sockets queued in listening queues on _our_ listening sockets, + * which are connected to our opened sockets. + */ + +static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx) +{ + struct sock *sk = obj->o_obj; + cpt_object_t *cobj; + struct sk_buff *skb; + + skb = skb_peek(&sk->sk_receive_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { + struct sock *lsk = skb->sk; + if (unix_peer(lsk) && + lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) { + if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL) + return -ENOMEM; + cobj->o_parent = obj->o_parent; + } + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_receive_queue.lock); + } + + return 0; +} + +int cpt_index_sockets(cpt_context_t * ctx) +{ + cpt_object_t *obj; + unsigned long index = 0; + + /* Collect not-yet-accepted children of listening sockets. */ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + + if (sk->sk_state != TCP_LISTEN) + continue; + + if (sk->sk_family == AF_UNIX) + collect_one_unix_listening_sock(obj, ctx); + } + + /* Assign indices to all the sockets. 
*/ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + cpt_obj_setindex(obj, index++, ctx); + + if (sk->sk_socket && sk->sk_socket->file) { + cpt_object_t *tobj; + tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx); + if (tobj) + cpt_obj_setindex(tobj, obj->o_index, ctx); + } + } + + return 0; +} + +void cpt_unlock_sockets(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + lockdep_off(); + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && obj->o_lock) { + if (sk->sk_socket) + release_sock(sk); + } + } + lockdep_on(); +} + +void cpt_kill_sockets(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && obj->o_lock) { + struct ve_struct *old_env; + old_env = set_exec_env(sk->owner_env); + cpt_kill_socket(sk, ctx); + if (sk->sk_socket) + release_sock_nobacklog(sk); + set_exec_env(old_env); + } + } +} + +__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx) +{ + struct fasync_struct *fa; + struct inode *inode = file->f_dentry->d_inode; + struct socket *sock; + + sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket; + + for (fa = sock->fasync_list; fa; fa = fa->fa_next) { + if (fa->fa_file == file) + return fa->fa_fd; + } + return -1; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_socket.h linux-2.6.32.48-openvz/kernel/cpt/cpt_socket.h --- linux-2.6.32.48/kernel/cpt/cpt_socket.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_socket.h 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,37 @@ +struct sock; + +int cpt_collect_passedfds(cpt_context_t *); +int cpt_index_sockets(cpt_context_t *); +int cpt_collect_socket(struct file *, cpt_context_t *); +int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx); +int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx); +int cpt_dump_synwait_queue(struct sock *sk, int 
index, struct cpt_context *ctx); +int rst_sockets(struct cpt_context *ctx); +int rst_sockets_complete(struct cpt_context *ctx); +int cpt_dump_orphaned_sockets(struct cpt_context *ctx); + +int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx); +struct sk_buff * rst_skb(struct sock *sk, loff_t *pos_p, __u32 *owner, + __u32 *queue, struct cpt_context *ctx); + +void cpt_unlock_sockets(cpt_context_t *); +void cpt_kill_sockets(cpt_context_t *); + + +int cpt_kill_socket(struct sock *, cpt_context_t *); +int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*); +int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx); +int rst_listen_socket_in(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx); +__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx); +int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *); +int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx); +int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx); +int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct sock *sk, + struct cpt_context *ctx); +int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx); + +int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx); +int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx); diff -urNp linux-2.6.32.48/kernel/cpt/cpt_socket_in.c linux-2.6.32.48-openvz/kernel/cpt/cpt_socket_in.c --- linux-2.6.32.48/kernel/cpt/cpt_socket_in.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_socket_in.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,448 @@ +/* + * + * kernel/cpt/cpt_socket_in.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +static inline __u32 jiffies_export(unsigned long tmo) +{ + __s32 delta = (long)(tmo - jiffies); + return delta; +} + +static inline __u32 tcp_jiffies_export(__u32 tmo) +{ + __s32 delta = tmo - tcp_time_stamp; + return delta; +} + +int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx) +{ + struct sk_buff *skb; + struct tcp_sock *tp; + + if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP) + return 0; + + tp = tcp_sk(sk); + + skb = skb_peek(&tp->out_of_order_queue); + while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) { + int err; + + err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, sk, ctx); + if (err) + return err; + + spin_lock_irq(&tp->out_of_order_queue.lock); + skb = skb->next; + spin_unlock_irq(&tp->out_of_order_queue.lock); + } + return 0; +} + +static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk, + struct cpt_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + + si->cpt_pred_flags = tp->pred_flags; + si->cpt_rcv_nxt = tp->rcv_nxt; + si->cpt_snd_nxt = tp->snd_nxt; + si->cpt_snd_una = tp->snd_una; + si->cpt_snd_sml = tp->snd_sml; + si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp); + si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime); + si->cpt_tcp_header_len = tp->tcp_header_len; + si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending; + si->cpt_quick = inet_csk(sk)->icsk_ack.quick; + si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong; + si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked; + si->cpt_ato = inet_csk(sk)->icsk_ack.ato; + si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout); + si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime); + 
si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size; + si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss; + si->cpt_snd_wl1 = tp->snd_wl1; + si->cpt_snd_wnd = tp->snd_wnd; + si->cpt_max_window = tp->max_window; + si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie; + si->cpt_mss_cache = tp->mss_cache; + si->cpt_mss_cache_std = tp->mss_cache; /* FIXMW was tp->mss_cache_std */ + si->cpt_mss_clamp = tp->rx_opt.mss_clamp; + si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len; + si->cpt_ext2_header_len = 0; + si->cpt_ca_state = inet_csk(sk)->icsk_ca_state; + si->cpt_retransmits = inet_csk(sk)->icsk_retransmits; + si->cpt_reordering = tp->reordering; + si->cpt_frto_counter = tp->frto_counter; + si->cpt_frto_highmark = tp->frto_highmark; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) + // // si->cpt_adv_cong = tp->adv_cong; +#endif + si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept; + si->cpt_backoff = inet_csk(sk)->icsk_backoff; + si->cpt_srtt = tp->srtt; + si->cpt_mdev = tp->mdev; + si->cpt_mdev_max = tp->mdev_max; + si->cpt_rttvar = tp->rttvar; + si->cpt_rtt_seq = tp->rtt_seq; + si->cpt_rto = inet_csk(sk)->icsk_rto; + si->cpt_packets_out = tp->packets_out; + si->cpt_left_out = tp->sacked_out + tp->lost_out; + si->cpt_retrans_out = tp->retrans_out; + si->cpt_lost_out = tp->lost_out; + si->cpt_sacked_out = tp->sacked_out; + si->cpt_fackets_out = tp->fackets_out; + si->cpt_snd_ssthresh = tp->snd_ssthresh; + si->cpt_snd_cwnd = tp->snd_cwnd; + si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt; + si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp; + si->cpt_snd_cwnd_used = tp->snd_cwnd_used; + si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp); + si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout); + si->cpt_ka_timeout = 0; + si->cpt_rcv_wnd = tp->rcv_wnd; + si->cpt_rcv_wup = tp->rcv_wup; + si->cpt_write_seq = tp->write_seq; + si->cpt_pushed_seq = tp->pushed_seq; + si->cpt_copied_seq = tp->copied_seq; + si->cpt_tstamp_ok = 
tp->rx_opt.tstamp_ok; + si->cpt_wscale_ok = tp->rx_opt.wscale_ok; + si->cpt_sack_ok = tp->rx_opt.sack_ok; + si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp; + si->cpt_snd_wscale = tp->rx_opt.snd_wscale; + si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale; + si->cpt_nonagle = tp->nonagle; + si->cpt_keepalive_probes = tp->keepalive_probes; + si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval; + si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr; + si->cpt_ts_recent = tp->rx_opt.ts_recent; + si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + si->cpt_user_mss = tp->rx_opt.user_mss; + si->cpt_dsack = tp->rx_opt.dsack; + si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq; + si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq; + si->cpt_sack_array[2] = tp->selective_acks[0].start_seq; + si->cpt_sack_array[3] = tp->selective_acks[0].end_seq; + si->cpt_sack_array[4] = tp->selective_acks[1].start_seq; + si->cpt_sack_array[5] = tp->selective_acks[1].end_seq; + si->cpt_sack_array[6] = tp->selective_acks[2].start_seq; + si->cpt_sack_array[7] = tp->selective_acks[2].end_seq; + si->cpt_sack_array[8] = tp->selective_acks[3].start_seq; + si->cpt_sack_array[9] = tp->selective_acks[3].end_seq; + si->cpt_window_clamp = tp->window_clamp; + si->cpt_rcv_ssthresh = tp->rcv_ssthresh; + si->cpt_probes_out = inet_csk(sk)->icsk_probes_out; + si->cpt_num_sacks = tp->rx_opt.num_sacks; + si->cpt_advmss = tp->advmss; + si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries; + si->cpt_ecn_flags = tp->ecn_flags; + si->cpt_prior_ssthresh = tp->prior_ssthresh; + si->cpt_high_seq = tp->high_seq; + si->cpt_retrans_stamp = tp->retrans_stamp; + si->cpt_undo_marker = tp->undo_marker; + si->cpt_undo_retrans = tp->undo_retrans; + si->cpt_urg_seq = tp->urg_seq; + si->cpt_urg_data = tp->urg_data; + si->cpt_pending = inet_csk(sk)->icsk_pending; + si->cpt_snd_up = tp->snd_up; + si->cpt_keepalive_time = tp->keepalive_time; + si->cpt_keepalive_intvl = tp->keepalive_intvl; + si->cpt_linger2 = tp->linger2; + + if (sk->sk_state != 
TCP_LISTEN && + sk->sk_state != TCP_CLOSE && + sock_flag(sk, SOCK_KEEPOPEN)) { + si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires); + } + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + { + extern struct inet_connection_sock_af_ops ipv6_mapped; + if (sk->sk_family == AF_INET6 && + inet_csk(sk)->icsk_af_ops == &ipv6_mapped) + si->cpt_mapped = 1; + } +#endif + + return 0; +} + + +int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk, + struct cpt_context *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + if (sk->sk_family == AF_INET) { + struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr); + sin->sin_family = AF_INET; + sin->sin_port = inet->sport; + sin->sin_addr.s_addr = inet->rcv_saddr; + si->cpt_laddrlen = sizeof(*sin); + } else if (sk->sk_family == AF_INET6) { + struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = inet->sport; + memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16); + si->cpt_laddrlen = sizeof(*sin6); + } + if (!inet->num) + si->cpt_laddrlen = 0; + + si->cpt_daddr = inet->daddr; + si->cpt_dport = inet->dport; + si->cpt_saddr = inet->saddr; + si->cpt_rcv_saddr = inet->rcv_saddr; + si->cpt_sport = inet->sport; + si->cpt_uc_ttl = inet->uc_ttl; + si->cpt_tos = inet->tos; + si->cpt_cmsg_flags = inet->cmsg_flags; + si->cpt_mc_index = inet->mc_index; + si->cpt_mc_addr = inet->mc_addr; + si->cpt_hdrincl = inet->hdrincl; + si->cpt_mc_ttl = inet->mc_ttl; + si->cpt_mc_loop = inet->mc_loop; + si->cpt_pmtudisc = inet->pmtudisc; + si->cpt_recverr = inet->recverr; + si->cpt_freebind = inet->freebind; + si->cpt_idcounter = inet->id; + + si->cpt_cork_flags = inet->cork.flags; + si->cpt_cork_fragsize = 0; + si->cpt_cork_length = inet->cork.length; + si->cpt_cork_addr = inet->cork.addr; + si->cpt_cork_saddr = inet->cork.fl.fl4_src; + si->cpt_cork_daddr = inet->cork.fl.fl4_dst; + si->cpt_cork_oif = inet->cork.fl.oif; + if 
(inet->cork.dst) { + struct rtable *rt = (struct rtable *)inet->cork.dst; + si->cpt_cork_fragsize = inet->cork.fragsize; + si->cpt_cork_saddr = rt->fl.fl4_src; + si->cpt_cork_daddr = rt->fl.fl4_dst; + si->cpt_cork_oif = rt->fl.oif; + } + + if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { + struct udp_sock *up = udp_sk(sk); + si->cpt_udp_pending = up->pending; + si->cpt_udp_corkflag = up->corkflag; + si->cpt_udp_encap = up->encap_type; + si->cpt_udp_len = up->len; + } + + if (sk->sk_family == AF_INET6) { + memcpy(si->cpt_saddr6, &np->saddr, 16); + memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16); + memcpy(si->cpt_daddr6, &np->daddr, 16); + si->cpt_flow_label6 = np->flow_label; + si->cpt_frag_size6 = np->frag_size; + si->cpt_hop_limit6 = np->hop_limit; + si->cpt_mcast_hops6 = np->mcast_hops; + si->cpt_mcast_oif6 = np->mcast_oif; + si->cpt_rxopt6 = np->rxopt.all; + si->cpt_mc_loop6 = np->mc_loop; + si->cpt_recverr6 = np->recverr; + si->cpt_sndflow6 = np->sndflow; + si->cpt_pmtudisc6 = np->pmtudisc; + si->cpt_ipv6only6 = np->ipv6only; + si->cpt_mapped = 0; + } + + if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) + cpt_dump_socket_tcp(si, sk, ctx); + + return 0; +} + +int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx) +{ + struct request_sock *req; + + for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next) + cpt_dump_socket(NULL, req->sk, -1, index, ctx); + return 0; +} + + +static int dump_openreq(struct request_sock *req, struct sock *sk, int index, + struct cpt_context *ctx) +{ + struct cpt_openreq_image *v = cpt_get_buf(ctx); + + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_OPENREQ; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn; + v->cpt_snt_isn = tcp_rsk(req)->snt_isn; + v->cpt_rmt_port = inet_rsk(req)->rmt_port; + v->cpt_mss = req->mss; + v->cpt_family = req->rsk_ops->family; + 
v->cpt_retrans = req->retrans; + v->cpt_snd_wscale = inet_rsk(req)->snd_wscale; + v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale; + v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok; + v->cpt_sack_ok = inet_rsk(req)->sack_ok; + v->cpt_wscale_ok = inet_rsk(req)->wscale_ok; + v->cpt_ecn_ok = inet_rsk(req)->ecn_ok; + v->cpt_acked = inet_rsk(req)->acked; + v->cpt_window_clamp = req->window_clamp; + v->cpt_rcv_wnd = req->rcv_wnd; + v->cpt_ts_recent = req->ts_recent; + v->cpt_expires = jiffies_export(req->expires); + + if (v->cpt_family == AF_INET) { + memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4); + memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4); + } else { +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16); + memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16); + v->cpt_iif = inet6_rsk(req)->iif; +#endif + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_close_object(ctx); + return 0; +} + +int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx) +{ + struct inet_connection_sock *icsk; + struct listen_sock *lopt; + struct request_sock *req; + int nr_entries; + int i; + + icsk = inet_csk(sk); + lopt = icsk->icsk_accept_queue.listen_opt; + nr_entries = icsk->icsk_accept_queue.listen_opt->nr_table_entries; + + for (i=0; i < nr_entries; i++) { + for (req=lopt->syn_table[i]; req; req=req->dl_next) { + loff_t saved_obj; + cpt_push_object(&saved_obj, ctx); + dump_openreq(req, sk, index, ctx); + cpt_pop_object(&saved_obj, ctx); + } + } + return 0; +} + + +int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx) +{ + if (sk->sk_state != TCP_CLOSE && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && + sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + tcp_set_state(sk, TCP_CLOSE); + else + sk->sk_prot->disconnect(sk, 0); + } + return 0; +} + +int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx) +{ + struct inet_sock *inet = 
inet_sk(sk);
+	struct ip_mc_socklist *iml;
+	/* Per membership: a cpt_sockmc_image header, then 16 bytes per source filter. */
+	for (iml = inet->mc_list; iml; iml = iml->next) {
+		struct cpt_sockmc_image smi;
+		int scnt = 0;	/* size of the source-filter payload, in bytes */
+		int i;
+
+		if (iml->sflist)
+			scnt = iml->sflist->sl_count*16;
+
+		smi.cpt_next = sizeof(smi) + scnt;
+		smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
+		smi.cpt_hdrlen = sizeof(smi);
+		smi.cpt_content = CPT_CONTENT_DATA;
+
+		smi.cpt_family = AF_INET;
+		smi.cpt_mode = iml->sfmode;
+		smi.cpt_ifindex = iml->multi.imr_ifindex;
+		memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr));
+		smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr;
+
+		ctx->write(&smi, sizeof(smi), ctx);
+		/* scnt counts bytes; walk sl_count entries so sl_addr[] is not overrun */
+		for (i = 0; i < scnt / 16; i++) {
+			u32 addr[4];
+			memset(&addr, 0, sizeof(addr));
+			addr[0] = iml->sflist->sl_addr[i];
+			ctx->write(&addr, sizeof(addr), ctx);
+		}
+	}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_mc_socklist *mcl;
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) {
+			struct cpt_sockmc_image smi;
+			int scnt = 0;	/* size of the source-filter payload, in bytes */
+			int i;
+
+			if (mcl->sflist)
+				scnt = mcl->sflist->sl_count*16;
+
+			smi.cpt_next = sizeof(smi) + scnt;
+			smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
+			smi.cpt_hdrlen = sizeof(smi);
+			smi.cpt_content = CPT_CONTENT_DATA;
+
+			smi.cpt_family = AF_INET6;
+			smi.cpt_mode = mcl->sfmode;
+			smi.cpt_ifindex = mcl->ifindex;
+			memcpy(&smi.cpt_mcaddr, &mcl->addr, sizeof(smi.cpt_mcaddr));
+
+			ctx->write(&smi, sizeof(smi), ctx);
+			for (i = 0; i < scnt / 16; i++)	/* entries, not bytes */
+				ctx->write(&mcl->sflist->sl_addr[i], 16, ctx);
+		}
+	}
+#endif
+	return 0;
+}
diff -urNp linux-2.6.32.48/kernel/cpt/cpt_syscalls.h linux-2.6.32.48-openvz/kernel/cpt/cpt_syscalls.h
--- linux-2.6.32.48/kernel/cpt/cpt_syscalls.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.32.48-openvz/kernel/cpt/cpt_syscalls.h	2011-11-21 17:40:47.000000000 -0500
@@ -0,0 +1,101 @@
+#include
+#include
+#include
+#include
+
+#define WRAP(c, args) return sys_##c args
+#define WRAP2(c, args) int err; mm_segment_t oldfs; \
+	oldfs = get_fs(); set_fs(KERNEL_DS); \
+	err = sys_##c args ;\
+	set_fs(oldfs); \
+	return err
+
+static inline int sc_close(int fd)
+{
+	WRAP(close, (fd));
+}
+
+static inline int sc_dup2(int fd1, int fd2)
+{
+	WRAP(dup2, (fd1, fd2));
+}
+
+static inline int sc_unlink(char *name)
+{
+	WRAP2(unlink, (name));
+}
+
+static inline int sc_pipe(int *pfd)
+{
+	return do_pipe_flags(pfd, 0);
+}
+
+static inline int sc_mknod(char *name, int mode, int dev)
+{
+	WRAP2(mknod, (name, mode, dev));
+}
+/* Change the mode of @name through sys_chmod(), called under KERNEL_DS by WRAP2. */
+static inline int sc_chmod(char *name, int mode)
+{
+	WRAP2(chmod, (name, mode));	/* must be chmod: mkdir here would create a directory */
+}
+
+static inline int sc_chown(char *name, int uid, int gid)
+{
+	WRAP2(chown, (name, uid, gid));
+}
+
+static inline int sc_mkdir(char *name, int mode)
+{
+	WRAP2(mkdir, (name, mode));
+}
+
+static inline int sc_rmdir(char *name)
+{
+	WRAP2(rmdir, (name));
+}
+
+static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags)
+{
+	WRAP2(mount, (mntdev ? : "none", mntpnt, type, flags, NULL));
+}
+
+static inline int sc_mprotect(unsigned long start, size_t len,
+			      unsigned long prot)
+{
+	WRAP(mprotect, (start, len, prot));
+}
+
+static inline int sc_mlock(unsigned long start, size_t len)
+{
+	WRAP(mlock, (start, len));
+}
+
+static inline int sc_munlock(unsigned long start, size_t len)
+{
+	WRAP(munlock, (start, len));
+}
+
+static inline int sc_remap_file_pages(unsigned long start, size_t len,
+				      unsigned long prot, unsigned long pgoff,
+				      unsigned long flags)
+{
+	WRAP(remap_file_pages, (start, len, prot, pgoff, flags));
+}
+
+static inline int sc_waitx(int pid, int opt, int *stat_addr)
+{
+	WRAP(wait4, (pid, stat_addr, opt, NULL));
+}
+
+static inline int sc_flock(int fd, int flags)
+{
+	WRAP(flock, (fd, flags));
+}
+
+static inline int sc_open(char* path, int flags, int mode)
+{
+	WRAP(open, (path, flags, mode));
+}
+
+extern int sc_execve(char *cms, char **argv, char **env);
diff -urNp linux-2.6.32.48/kernel/cpt/cpt_sysvipc.c
linux-2.6.32.48-openvz/kernel/cpt/cpt_sysvipc.c --- linux-2.6.32.48/kernel/cpt/cpt_sysvipc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_sysvipc.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,403 @@ +/* + * + * kernel/cpt/cpt_sysvipc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" + +struct _warg { + struct file *file; + struct cpt_sysvshm_image *v; +}; + +static int dump_one_shm(struct shmid_kernel *shp, void *arg) +{ + struct _warg *warg = arg; + struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v; + + if (shp->shm_file != warg->file) + return 0; + + v->cpt_key = shp->shm_perm.key; + v->cpt_uid = shp->shm_perm.uid; + v->cpt_gid = shp->shm_perm.gid; + v->cpt_cuid = shp->shm_perm.cuid; + v->cpt_cgid = shp->shm_perm.cgid; + v->cpt_mode = shp->shm_perm.mode; + v->cpt_seq = shp->shm_perm.seq; + + v->cpt_id = shp->shm_perm.id; + v->cpt_segsz = shp->shm_segsz; + v->cpt_atime = shp->shm_atim; + v->cpt_ctime = shp->shm_ctim; + v->cpt_dtime = shp->shm_dtim; + v->cpt_creator = shp->shm_cprid; + v->cpt_last = shp->shm_lprid; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) + v->cpt_mlockuser = shp->mlock_user ? 
shp->mlock_user->uid : -1;
+#else
+	v->cpt_mlockuser = -1;
+#endif
+	return 1;
+}
+
+int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx)
+{
+	struct cpt_sysvshm_image *v = cpt_get_buf(ctx);
+	struct _warg warg;
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_SYSV_SHM;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	warg.file = file;
+	warg.v = v;
+	if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) {
+		cpt_release_buf(ctx);
+		return -ESRCH;
+	}
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+	return 0;
+}
+
+
+int match_sem(int id, struct sem_array *sema, void *arg)
+{
+	if (id != (unsigned long)arg)
+		return 0;
+	return sema->sem_nsems + 1;
+}
+
+static int get_sem_nsem(int id, cpt_context_t *ctx)
+{
+	int res;
+	res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id);
+	if (res > 0)
+		return res - 1;
+	eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id);
+	return -ESRCH;
+}
+
+static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx)
+{
+	struct cpt_sysvsem_undo_image v;
+	loff_t saved_obj;
+
+	cpt_open_object(NULL, ctx);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_SEMUNDO;
+	v.cpt_id = su->semid;
+	v.cpt_nsem = get_sem_nsem(su->semid, ctx);
+	if ((int)v.cpt_nsem < 0)
+		return -ESRCH;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx);
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+struct sem_warg {
+	int last_id;
+	struct cpt_sysvsem_image *v;
+};
+/* Walk callback: skip ids at or below warg->last_id, else fill warg->v and return 1. */
+static int dump_one_sem(int id, struct sem_array *sma, void *arg)
+{
+	struct sem_warg * warg = (struct sem_warg *)arg;
+	struct cpt_sysvsem_image *v = warg->v;
+	int i;
+
+	if (warg->last_id != -1) {
+		if ((id % IPCMNI) <= warg->last_id)
+			return 0;
+	}
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_SYSV_SEM;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_SEMARRAY;
+
+	v->cpt_key = sma->sem_perm.key;
+	v->cpt_uid = sma->sem_perm.uid;
+	v->cpt_gid = sma->sem_perm.gid;
+	v->cpt_cuid = sma->sem_perm.cuid;
+	v->cpt_cgid = sma->sem_perm.cgid;
+	v->cpt_mode = sma->sem_perm.mode;
+	v->cpt_seq = sma->sem_perm.seq;
+
+	v->cpt_id = id;
+	v->cpt_ctime = sma->sem_ctime;
+	v->cpt_otime = sma->sem_otime;
+	/* Append one {semval, sempid} pair per semaphore; -EINVAL once the page is full. */
+	for (i=0; i<sma->sem_nsems; i++) {
+		struct {
+			__u32 semval;
+			__u32 sempid;
+		} *s = (void*)v + v->cpt_next;
+		if (v->cpt_next >= PAGE_SIZE - sizeof(*s))
+			return -EINVAL;
+		s->semval = sma->sem_base[i].semval;
+		s->sempid = sma->sem_base[i].sempid;
+		v->cpt_next += sizeof(*s);
+	}
+
+	warg->last_id = id % IPCMNI;
+	return 1;
+}
+
+
+int cpt_dump_sysvsem(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	struct sem_warg warg;
+
+	/* Dumping semaphores is quite tricky because we cannot
+	 * write to dump file under lock inside sysvipc_walk_sem().
+	 */
+	cpt_open_section(ctx, CPT_SECT_SYSV_SEM);
+	warg.last_id = -1;
+	warg.v = cpt_get_buf(ctx);
+	for (;;) {
+		if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0)
+			break;
+		ctx->write(warg.v, warg.v->cpt_next, ctx);
+	}
+	cpt_release_buf(ctx);
+	cpt_close_section(ctx);
+
+	cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO);
+	for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
+		struct sem_undo_list *semu = obj->o_obj;
+		struct sem_undo *su;
+		struct cpt_object_hdr v;
+		loff_t saved_obj;
+
+		cpt_open_object(obj, ctx);
+
+		v.cpt_next = CPT_NULL;
+		v.cpt_object = CPT_OBJ_SYSVSEM_UNDO;
+		v.cpt_hdrlen = sizeof(v);
+		v.cpt_content = CPT_CONTENT_ARRAY;
+
+		ctx->write(&v, sizeof(v), ctx);
+
+		cpt_push_object(&saved_obj, ctx);
+		list_for_each_entry(su, &semu->list_proc, list_proc) {
+			if (su->semid != -1) {
+				int err;
+				err = dump_one_semundo(su, ctx);
+				if (err < 0)
+					return err;
+			}
+		}
+		cpt_pop_object(&saved_obj, ctx);
+
+		cpt_close_object(ctx);
+	}
+	cpt_close_section(ctx);
+	return 0;
+}
+
+struct msg_warg {
+	int last_id;
+	struct msg_queue
*msq; + struct cpt_sysvmsg_image *v; +}; + +static int dump_one_msg(int id, struct msg_queue *msq, void *arg) +{ + struct msg_warg * warg = (struct msg_warg *)arg; + struct cpt_sysvmsg_image *v = warg->v; + + if (warg->last_id != -1) { + if ((id % IPCMNI) <= warg->last_id) + return 0; + } + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_SYSVMSG; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_key = msq->q_perm.key; + v->cpt_uid = msq->q_perm.uid; + v->cpt_gid = msq->q_perm.gid; + v->cpt_cuid = msq->q_perm.cuid; + v->cpt_cgid = msq->q_perm.cgid; + v->cpt_mode = msq->q_perm.mode; + v->cpt_seq = msq->q_perm.seq; + + v->cpt_id = id; + v->cpt_stime = msq->q_stime; + v->cpt_rtime = msq->q_rtime; + v->cpt_ctime = msq->q_ctime; + v->cpt_last_sender = msq->q_lspid; + v->cpt_last_receiver = msq->q_lrpid; + v->cpt_qbytes = msq->q_qbytes; + + warg->msq = msq; + warg->last_id = id % IPCMNI; + return 1; +} + +static int do_store(void * src, int len, int offset, void * data) +{ + cpt_context_t * ctx = data; + ctx->write(src, len, ctx); + return 0; +} + +static void cpt_dump_one_sysvmsg(struct msg_msg *m, cpt_context_t * ctx) +{ + loff_t saved_obj; + struct cpt_sysvmsg_msg_image mv; + + cpt_open_object(NULL, ctx); + mv.cpt_next = CPT_NULL; + mv.cpt_object = CPT_OBJ_SYSVMSG_MSG; + mv.cpt_hdrlen = sizeof(mv); + mv.cpt_content = CPT_CONTENT_DATA; + + mv.cpt_type = m->m_type; + mv.cpt_size = m->m_ts; + + ctx->write(&mv, sizeof(mv), ctx); + + cpt_push_object(&saved_obj, ctx); + sysv_msg_store(m, do_store, m->m_ts, ctx); + cpt_pop_object(&saved_obj, ctx); + cpt_close_object(ctx); +} + +int cpt_dump_sysvmsg(struct cpt_context *ctx) +{ + struct msg_warg warg; + + /* Dumping msg queues is tricky because we cannot + * write to dump file under lock inside sysvipc_walk_msg(). + * + * And even worse, we have to access msg list in an unserialized + * context. It is fragile. But VE is still frozen, remember? 
+ */ + cpt_open_section(ctx, CPT_SECT_SYSV_MSG); + warg.last_id = -1; + warg.v = cpt_get_buf(ctx); + for (;;) { + loff_t saved_obj; + struct msg_msg * m; + + if (sysvipc_walk_msg(dump_one_msg, &warg) <= 0) + break; + + cpt_open_object(NULL, ctx); + + ctx->write(warg.v, warg.v->cpt_next, ctx); + + cpt_push_object(&saved_obj, ctx); + list_for_each_entry(m, &warg.msq->q_messages, m_list) { + cpt_dump_one_sysvmsg(m, ctx); + } + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + } + cpt_release_buf(ctx); + cpt_close_section(ctx); + return 0; +} + +static int cpt_collect_sysvsem_undo(cpt_context_t *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->exit_state) { + /* ipc/sem.c forgets to clear tsk->sysvsem.undo_list + * on exit. Grrr... */ + continue; + } + if (tsk->sysvsem.undo_list && + cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL) + return -ENOMEM; + } + + for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { + struct sem_undo_list *semu = obj->o_obj; + + if (atomic_read(&semu->refcnt) != obj->o_count) { + eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt)); + return -EBUSY; + } + } + return 0; +} + +static int collect_one_shm(struct shmid_kernel *shp, void *arg) +{ + cpt_context_t *ctx = arg; + + if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL) + return -ENOMEM; + return 0; +} + +int cpt_collect_sysvshm(cpt_context_t * ctx) +{ + int err; + + err = sysvipc_walk_shm(collect_one_shm, ctx); + + return err < 0 ? 
err : 0; +} + +int cpt_collect_sysv(cpt_context_t * ctx) +{ + int err; + + err = cpt_collect_sysvsem_undo(ctx); + if (err) + return err; + err = cpt_collect_sysvshm(ctx); + if (err) + return err; + + return 0; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_tty.c linux-2.6.32.48-openvz/kernel/cpt/cpt_tty.c --- linux-2.6.32.48/kernel/cpt/cpt_tty.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_tty.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,215 @@ +/* + * + * kernel/cpt/cpt_tty.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +/* We must support at least N_TTY. */ + +int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx) +{ + struct tty_struct *tty = file->private_data; + cpt_object_t *obj; + struct cpt_obj_ref o; + loff_t saved_pos; + + obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx); + if (!obj) + return -EINVAL; + + cpt_push_object(&saved_pos, ctx); + + o.cpt_next = sizeof(o); + o.cpt_object = CPT_OBJ_REF; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_VOID; + o.cpt_pos = obj->o_pos; + ctx->write(&o, sizeof(o), ctx); + + cpt_pop_object(&saved_pos, ctx); + + return 0; +} + +int cpt_collect_tty(struct file *file, cpt_context_t * ctx) +{ + struct tty_struct *tty = file->private_data; + + if (tty) { + if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL) + return -ENOMEM; + if (tty->link) { + cpt_object_t *obj; + + obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx); + if (obj == NULL) + return -ENOMEM; + /* Undo o_count, tty->link is not a reference */ + obj->o_count--; + } + } + return 0; +} + +int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct tty_struct *tty = obj->o_obj; + struct cpt_tty_image *v; + + if (tty->link) { + if 
(lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) { + eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE); + return -EINVAL; + } + if (tty->link->link != tty) { + eprintk_ctx("bad pty pair\n"); + return -EINVAL; + } + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_SLAVE && + tty->link->count) + obj->o_count++; + } + if (obj->o_count != tty->count) { + eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count); + return -EBUSY; + } + + cpt_open_object(obj, ctx); + + v = cpt_get_buf(ctx); + v->cpt_next = -1; + v->cpt_object = CPT_OBJ_TTY; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_index = tty->index; + v->cpt_link = -1; + if (tty->link) + v->cpt_link = tty->link->index; + v->cpt_drv_type = tty->driver->type; + v->cpt_drv_subtype = tty->driver->subtype; + v->cpt_drv_flags = tty->driver->flags; + v->cpt_packet = tty->packet; + v->cpt_stopped = tty->stopped; + v->cpt_hw_stopped = tty->hw_stopped; + v->cpt_flow_stopped = tty->flow_stopped; + v->cpt_flags = tty->flags; + v->cpt_ctrl_status = tty->ctrl_status; + v->cpt_canon_data = tty->canon_data; + v->cpt_canon_head = tty->canon_head - tty->read_tail; + v->cpt_canon_column = tty->canon_column; + v->cpt_column = tty->column; + v->cpt_erasing = tty->erasing; + v->cpt_lnext = tty->lnext; + v->cpt_icanon = tty->icanon; + v->cpt_raw = tty->raw; + v->cpt_real_raw = tty->real_raw; + v->cpt_closing = tty->closing; + v->cpt_minimum_to_wake = tty->minimum_to_wake; + v->cpt_pgrp = 0; + if (tty->pgrp) { + v->cpt_pgrp = pid_vnr(tty->pgrp); + if ((int)v->cpt_pgrp < 0) { + dprintk_ctx("cannot map tty->pgrp %d -> %d\n", pid_vnr(tty->pgrp), (int)v->cpt_pgrp); + v->cpt_pgrp = -1; + } + } + v->cpt_session = 0; + if (tty->session) { + v->cpt_session = pid_vnr(tty->session); + if ((int)v->cpt_session < 0) { + eprintk_ctx("cannot map tty->session %d -> %d\n", pid_nr(tty->session), (int)v->cpt_session); 
+ cpt_release_buf(ctx); + return -EINVAL; + } + } + memcpy(v->cpt_name, tty->name, 64); + v->cpt_ws_row = tty->winsize.ws_row; + v->cpt_ws_col = tty->winsize.ws_col; + v->cpt_ws_prow = tty->winsize.ws_ypixel; + v->cpt_ws_pcol = tty->winsize.ws_xpixel; + if (tty->termios == NULL) { + eprintk_ctx("NULL termios"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_c_line = tty->termios->c_line; + v->cpt_c_iflag = tty->termios->c_iflag; + v->cpt_c_oflag = tty->termios->c_oflag; + v->cpt_c_cflag = tty->termios->c_cflag; + v->cpt_c_lflag = tty->termios->c_lflag; + memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS); + if (NCCS < 32) + memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS); + memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags)); + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (tty->read_buf && tty->read_cnt) { + struct cpt_obj_bits *v = cpt_get_buf(ctx); + loff_t saved_pos; + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = tty->read_cnt; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (tty->read_cnt) { + int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail); + ctx->write(tty->read_buf + tty->read_tail, n, ctx); + if (tty->read_cnt > n) + ctx->write(tty->read_buf, tty->read_cnt-n, ctx); + ctx->align(ctx); + } + + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + } + + cpt_close_object(ctx); + + return 0; +} + +__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx) +{ + struct tty_struct * tty; + struct fasync_struct *fa; + + tty = (struct tty_struct *)file->private_data; + + for (fa = tty->fasync; fa; fa = fa->fa_next) { + if (fa->fa_file == file) + return fa->fa_fd; + } + return -1; +} diff -urNp linux-2.6.32.48/kernel/cpt/cpt_ubc.c linux-2.6.32.48-openvz/kernel/cpt/cpt_ubc.c --- linux-2.6.32.48/kernel/cpt/cpt_ubc.c 1969-12-31 
19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_ubc.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,135 @@ +/* + * + * kernel/cpt/cpt_ubc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx); + if (obj != NULL) { + if (obj->o_count == 1) + get_beancounter(bc); + if (bc->parent != NULL && obj->o_parent == NULL) + obj->o_parent = cpt_add_ubc(bc->parent, ctx); + } + return obj; +} + +__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx); + if (obj == NULL) { + char buf[48]; + print_ub_uid(bc, buf, sizeof(buf)); + eprintk("CPT: unknown ub %s (%p)\n", buf, bc); + dump_stack(); + return CPT_NULL; + } + return obj->o_pos; +} + +static void dump_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm, + int held) +{ + dmp->barrier = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL); + dmp->limit = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL); + dmp->held = (held ? prm->held : CPT_NULL); + dmp->maxheld = prm->maxheld; + dmp->minheld = prm->minheld; + dmp->failcnt = prm->failcnt; +} + +static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct user_beancounter *bc; + struct cpt_beancounter_image *v; + int i; + + bc = obj->o_obj; + v = cpt_get_buf(ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_UBC; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + if (obj->o_parent != NULL) + v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos; + else + v->cpt_parent = CPT_NULL; + v->cpt_id = (obj->o_parent != NULL) ? 
bc->ub_uid : 0;
+	v->cpt_ub_resources = UB_RESOURCES;
+	BUILD_BUG_ON(ARRAY_SIZE(v->cpt_parms) < UB_RESOURCES * 2);
+	for (i = 0; i < UB_RESOURCES; i++) {
+		dump_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0);
+		dump_one_bc_parm(v->cpt_parms + i * 2 + 1, bc->ub_store + i, 1);
+	}
+	memset(v->cpt_parms + UB_RESOURCES * 2, 0,
+			sizeof(v->cpt_parms)
+				- UB_RESOURCES * 2 * sizeof(v->cpt_parms[0]));
+
+	cpt_open_object(obj, ctx);
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_close_object(ctx);
+
+	cpt_release_buf(ctx);
+	return 0;
+}
+/* Dump every collected UBC; a child is written only once its parent has a position. */
+int cpt_dump_ubc(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	int skipped;
+	int top;
+
+	cpt_open_section(ctx, CPT_SECT_UBC);
+	/* Repeat passes until no object had to wait for an undumped parent. */
+	do {
+		skipped = 0;
+		top = 0;
+		for_each_object(obj, CPT_OBJ_UBC) {
+			if (obj->o_parent == NULL)
+				top++;
+			if (obj->o_pos != CPT_NULL)
+				continue;
+			if (obj->o_parent != NULL &&
+			    ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL)
+				skipped++;
+			else
+				dump_one_bc(obj, ctx);
+		}
+	} while (skipped && (top < 2));
+
+	cpt_close_section(ctx);
+	if (top > 1) {
+		eprintk_ctx("More than one top level ub exist\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+/* Call put_beancounter() on every collected UBC object. */
+void cpt_finish_ubc(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_UBC)
+		put_beancounter(obj->o_obj);
+}
diff -urNp linux-2.6.32.48/kernel/cpt/cpt_ubc.h linux-2.6.32.48-openvz/kernel/cpt/cpt_ubc.h
--- linux-2.6.32.48/kernel/cpt/cpt_ubc.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.32.48-openvz/kernel/cpt/cpt_ubc.h	2011-11-21 17:40:47.000000000 -0500
@@ -0,0 +1,23 @@
+#ifdef CONFIG_BEANCOUNTERS
+cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
+__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
+int cpt_dump_ubc(struct cpt_context *ctx);
+
+struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx);
+int rst_undump_ubc(struct cpt_context *ctx);
+
+void cpt_finish_ubc(struct cpt_context *ctx);
+void rst_finish_ubc(struct cpt_context *ctx);
+void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id); +void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id); +#else +static int inline cpt_dump_ubc(struct cpt_context *ctx) +{ return 0; } +static int inline rst_undump_ubc(struct cpt_context *ctx) +{ return 0; } +static void inline cpt_finish_ubc(struct cpt_context *ctx) +{ return; } +static void inline rst_finish_ubc(struct cpt_context *ctx) +{ return; } +#endif + diff -urNp linux-2.6.32.48/kernel/cpt/cpt_x8664.S linux-2.6.32.48-openvz/kernel/cpt/cpt_x8664.S --- linux-2.6.32.48/kernel/cpt/cpt_x8664.S 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/cpt_x8664.S 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,67 @@ +#define ASSEMBLY 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + .code64 + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorq %rax, %rax + pushq %rax /* ss */ + pushq %rax /* rsp */ + pushq $(1<<9) /* eflags - interrupts on */ + pushq $__KERNEL_CS /* cs */ + pushq \child_rip /* rip */ + pushq %rax /* orig rax */ + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + .endm + +ENTRY(asm_kernel_thread) + CFI_STARTPROC + FAKE_STACK_FRAME $child_rip + SAVE_ALL + + # rdi: flags, rsi: usp, rdx: will be &pt_regs + movq %rdx,%rdi + orq $0x00800000,%rdi + movq $-1, %rsi + movq %rsp, %rdx + + xorl %r8d,%r8d + xorl %r9d,%r9d + pushq %rcx + call do_fork_pid + addq $8, %rsp + /* call do_fork */ + movq %rax,RAX(%rsp) + xorl %edi,%edi + RESTORE_ALL + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +ENDPROC(asm_kernel_thread) + +child_rip: + pushq $0 # fake return address + CFI_STARTPROC + movq %rdi, %rax + movq %rsi, %rdi + call *%rax + movq %rax, %rdi + call do_exit + CFI_ENDPROC +ENDPROC(child_rip) + diff -urNp linux-2.6.32.48/kernel/cpt/Makefile linux-2.6.32.48-openvz/kernel/cpt/Makefile --- linux-2.6.32.48/kernel/cpt/Makefile 1969-12-31 19:00:00.000000000 
-0500 +++ linux-2.6.32.48-openvz/kernel/cpt/Makefile 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,53 @@ +# +# +# kernel/cpt/Makefile +# +# Copyright (C) 2000-2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o + +vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \ + cpt_mm.o cpt_files.o cpt_kernel.o \ + cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \ + cpt_conntrack.o cpt_epoll.o + +vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \ + rst_mm.o rst_files.o \ + rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \ + rst_conntrack.o rst_epoll.o + +ifeq ($(CONFIG_BEANCOUNTERS), y) +vzcpt-objs += cpt_ubc.o +vzrst-objs += rst_ubc.o +endif + +ifeq ($(CONFIG_INOTIFY_USER), y) +vzcpt-objs += cpt_inotify.o +vzrst-objs += rst_inotify.o +endif + +vzrst-objs += cpt_exports.o + +ifeq ($(CONFIG_VZ_CHECKPOINT), m) +vzrst-objs += cpt_obj.o cpt_kernel.o +endif + +ifeq ($(CONFIG_VZ_CHECKPOINT_ITER), y) +vzcpt-objs += cpt_iterative.o +vzrst-objs += rst_iterative.o +endif + +ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y) +vzcpt-objs += cpt_pagein.o +vzrst-objs += rst_pagein.o +endif + +ifeq ($(CONFIG_X86_64), y) +vzcpt-objs += cpt_x8664.o +ifeq ($(CONFIG_VZ_CHECKPOINT), m) +vzrst-objs += cpt_x8664.o +endif +endif diff -urNp linux-2.6.32.48/kernel/cpt/rst_conntrack.c linux-2.6.32.48-openvz/kernel/cpt/rst_conntrack.c --- linux-2.6.32.48/kernel/cpt/rst_conntrack.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_conntrack.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,328 @@ +/* + * + * kernel/cpt/rst_conntrack.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + +#include +#include +#include +#include +#include +#include +#include +#include + +#define ASSERT_READ_LOCK(x) do { } while (0) +#define ASSERT_WRITE_LOCK(x) do { } while (0) + + +#include "cpt_obj.h" +#include "cpt_context.h" + +struct ct_holder +{ + struct ct_holder *next; + struct ip_conntrack *ct; + int index; +}; + +static int decode_tuple(struct cpt_ipct_tuple *v, + struct ip_conntrack_tuple *tuple, int dir, + cpt_context_t *ctx) +{ + tuple->dst.ip = v->cpt_dst; + tuple->dst.u.all = v->cpt_dstport; + if (ctx->image_version < CPT_VERSION_16) { + /* In 2.6.9 kernel protonum has short type */ + __u16 protonum = *(__u16 *)&v->cpt_protonum; + if (protonum > 0xff && protonum < 0xffff) { + eprintk_ctx("tuple: protonum > 255: %u\n", protonum); + return -EINVAL; + } + tuple->dst.protonum = protonum; + tuple->dst.dir = dir; + } else { + tuple->dst.protonum = v->cpt_protonum; + tuple->dst.dir = v->cpt_dir; + if (dir != tuple->dst.dir) { + eprintk_ctx("dir != tuple->dst.dir\n"); + return -EINVAL; + } + } + + tuple->src.ip = v->cpt_src; + tuple->src.u.all = v->cpt_srcport; + return 0; +} + + +static int undump_expect_list(struct ip_conntrack *ct, + struct cpt_ip_conntrack_image *ci, + loff_t pos, struct ct_holder *ct_list, + cpt_context_t *ctx) +{ + loff_t end; + int err; + + end = pos + ci->cpt_next; + pos += ci->cpt_hdrlen; + while (pos < end) { + struct cpt_ip_connexpect_image v; + struct ip_conntrack_expect *exp; + struct ip_conntrack *sibling; + + err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx); + if (err) + return err; + + sibling = NULL; + if (v.cpt_sibling_conntrack) { + struct ct_holder *c; + + for (c = ct_list; c; c = 
c->next) {
+				if (c->index == v.cpt_sibling_conntrack) {
+					sibling = c->ct;
+					break;
+				}
+			}
+			if (!sibling) {
+				eprintk_ctx("lost sibling of expectation\n");
+				return -EINVAL;
+			}
+		}
+
+		write_lock_bh(&ip_conntrack_lock);
+
+		/* It is possible. Helper module could be just unregistered,
+		 * if expectation were on the list, it would be destroyed. */
+		if (ct->helper == NULL) {
+			write_unlock_bh(&ip_conntrack_lock);
+			dprintk_ctx("conntrack: no helper and non-trivial expectation\n");
+			/*
+			 * Step over this record before retrying the loop;
+			 * without advancing pos the same record would be
+			 * read again forever.
+			 */
+			pos += v.cpt_next;
+			continue;
+		}
+
+		exp = ip_conntrack_expect_alloc(NULL);
+		if (exp == NULL) {
+			write_unlock_bh(&ip_conntrack_lock);
+			return -ENOMEM;
+		}
+
+		if (decode_tuple(&v.cpt_tuple, &exp->tuple, 0, ctx) ||
+		    decode_tuple(&v.cpt_mask, &exp->mask, 0, ctx)) {
+			ip_conntrack_expect_put(exp);
+			write_unlock_bh(&ip_conntrack_lock);
+			return -EINVAL;
+		}
+
+		exp->master = ct;
+		nf_conntrack_get(&ct->ct_general);
+		ip_conntrack_expect_insert(exp);
+#if 0
+		if (sibling) {
+			exp->sibling = sibling;
+			sibling->master = exp;
+			LIST_DELETE(&ve_ip_conntrack_expect_list, exp);
+			ct->expecting--;
+			nf_conntrack_get(&master_ct(sibling)->infos[0]);
+		} else
+#endif
+		if (ct->helper->timeout) {
+			mod_timer(&exp->timeout, jiffies + v.cpt_timeout);
+		}
+		write_unlock_bh(&ip_conntrack_lock);
+
+		ip_conntrack_expect_put(exp);
+
+		pos += v.cpt_next;
+	}
+	return 0;
+}
+
+/*
+ * Rebuild one conntrack entry from image @ci located at @pos.
+ * A ct_holder for the new entry is pushed onto *ct_list (it stays there
+ * even if a later step fails), the entry is inserted with
+ * ip_conntrack_hash_insert(), and any trailing expectation records are
+ * restored through undump_expect_list().  Returns 0 or a negative errno.
+ */
+static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos,
+			 struct ct_holder **ct_list, cpt_context_t *ctx)
+{
+	int err = 0;
+	struct ip_conntrack *conntrack;
+	struct ct_holder *c;
+	struct ip_conntrack_tuple orig, repl;
+
+	c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL);
+	if (c == NULL)
+		return -ENOMEM;
+
+	if (decode_tuple(&ci->cpt_tuple[0], &orig, 0, ctx) ||
+	    decode_tuple(&ci->cpt_tuple[1], &repl, 1, ctx)) {
+		kfree(c);
+		return -EINVAL;
+	}
+
+	conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub);
+	if (!conntrack || IS_ERR(conntrack)) {
+		kfree(c);
+		return -ENOMEM;
+	}
+
+	c->ct = conntrack;
+	c->next = *ct_list;
+	*ct_list = c;
+	c->index = ci->cpt_index;
+
+	conntrack->status = ci->cpt_status;
+
+	memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto));
+	memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help));
+
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+	conntrack->mark = ci->cpt_mark;
+#endif
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+	defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+	conntrack->nat.masq_index = ci->cpt_masq_index;
+#endif
+	if (ci->cpt_initialized) {
+		conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos;
+		conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before;
+		conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after;
+		conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos;
+		conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before;
+		conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after;
+	}
+	if (conntrack->status & IPS_NAT_DONE_MASK)
+		ip_nat_hash_conntrack(conntrack);
+#endif
+
+	if (ci->cpt_ct_helper) {
+		conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple);
+		if (conntrack->helper == NULL) {
+			eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n");
+			err = -EINVAL;
+		}
+	}
+
+	ip_conntrack_hash_insert(conntrack);
+	conntrack->timeout.expires = jiffies + ci->cpt_timeout;
+
+	if (err == 0 && ci->cpt_next > ci->cpt_hdrlen)
+		err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx);
+
+	if (conntrack->helper)
+		ip_conntrack_helper_put(conntrack->helper);
+
+	return err;
+}
+
+/*
+ * Convert an image read in the older (compat) layout into the current
+ * struct cpt_ip_conntrack_image, in place.  Called for images older than
+ * CPT_VERSION_16.
+ */
+static void convert_conntrack_image(struct cpt_ip_conntrack_image *ci)
+{
+	struct cpt_ip_conntrack_image_compat img;
+
+	memcpy(&img, ci, sizeof(struct cpt_ip_conntrack_image_compat));
+	/*
+	 * Size of cpt_help_data in 2.6.9 kernel is 16 bytes,
+	 * in 2.6.18 cpt_help_data size is 24 bytes, so zero the rest 8 bytes
+	 */
+	memset(ci->cpt_help_data + 4, 0, 8);
+	ci->cpt_initialized = img.cpt_initialized;
+	ci->cpt_num_manips = img.cpt_num_manips;
+	memcpy(ci->cpt_nat_manips, img.cpt_nat_manips, sizeof(img.cpt_nat_manips));
+	memcpy(ci->cpt_nat_seq, img.cpt_nat_seq, sizeof(img.cpt_nat_seq));
+	ci->cpt_masq_index = img.cpt_masq_index;
+	/* Id will be assigned in ip_conntrack_hash_insert(), so make it 0 here */
+	ci->cpt_id = 0;
+	/* mark was not supported in 2.6.9, so set it to default 0 value */
+	ci->cpt_mark = 0;
+
+}
+
+/*
+ * Restore every conntrack object of section CPT_SECT_NET_CONNTRACK.
+ * Returns 0 when the section is absent.  Whether or not an error occurred,
+ * the timer of each conntrack created so far is armed with add_timer() and
+ * the temporary ct_holder list is freed before returning.
+ */
+int rst_restore_ip_conntrack(struct cpt_context * ctx)
+{
+	int err = 0;
+	loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_ip_conntrack_image ci;
+	struct ct_holder *c;
+	struct ct_holder *ct_list = NULL;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) {
+		eprintk_ctx("conntrack module ct->proto version mismatch\n");
+		return -EINVAL;
+	}
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx);
+		if (err)
+			break;
+		if (ctx->image_version < CPT_VERSION_16)
+			convert_conntrack_image(&ci);
+		err = undump_one_ct(&ci, sec, &ct_list, ctx);
+		if (err)
+			break;
+		sec += ci.cpt_next;
+	}
+
+	while ((c = ct_list) != NULL) {
+		ct_list = c->next;
+		if (c->ct)
+			add_timer(&c->ct->timeout);
+		kfree(c);
+	}
+
+	return err;
+}
+
+#else
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+/* Without conntrack support an image carrying a conntrack section is rejected. */
+int rst_restore_ip_conntrack(struct cpt_context * ctx)
+{
+	if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL)
+		return -EINVAL;
+	return 0;
+}
+
+#endif
diff -urNp linux-2.6.32.48/kernel/cpt/rst_context.c linux-2.6.32.48-openvz/kernel/cpt/rst_context.c
--- linux-2.6.32.48/kernel/cpt/rst_context.c
1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_context.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,331 @@ +/* + * + * kernel/cpt/rst_context.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" + +static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->read(file, addr, count, &file->f_pos); + set_fs(oldfs); + if (err != count) + return err >= 0 ? -EIO : err; + return 0; +} + +static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->read(file, addr, count, &pos); + set_fs(oldfs); + if (err != count) + return err >= 0 ? 
-EIO : err; + return 0; +} + +static void file_align(struct cpt_context *ctx) +{ + struct file *file = ctx->file; + + if (file) + file->f_pos = CPT_ALIGN(file->f_pos); +} + +int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end) +{ + struct cpt_section_hdr hdr; + int err; + loff_t pos; + + pos = ctx->sections[type]; + *start = *end = pos; + + if (pos != CPT_NULL) { + if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0) + return err; + if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr)) + return -EINVAL; + *start = pos + hdr.cpt_hdrlen; + *end = pos + hdr.cpt_next; + } + return 0; +} +EXPORT_SYMBOL(rst_get_section); + +void rst_context_init(struct cpt_context *ctx) +{ + int i; + + memset(ctx, 0, sizeof(*ctx)); + + init_MUTEX(&ctx->main_sem); + ctx->refcount = 1; + + ctx->current_section = -1; + ctx->current_object = -1; + ctx->pagesize = PAGE_SIZE; + ctx->read = file_read; + ctx->pread = file_pread; + ctx->align = file_align; + for (i=0; i < CPT_SECT_MAX; i++) + ctx->sections[i] = CPT_NULL; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + init_completion(&ctx->pgin_notify); +#endif + cpt_object_init(ctx); +} + +static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx) +{ + struct cpt_section_hdr h; + + while (start < end) { + int err; + + err = ctx->pread(&h, sizeof(h), ctx, start); + if (err) + return err; + if (h.cpt_hdrlen < sizeof(h) || + h.cpt_next < h.cpt_hdrlen || + start + h.cpt_next > end) + return -EINVAL; + if (h.cpt_section >= CPT_SECT_MAX) + return -EINVAL; + ctx->sections[h.cpt_section] = start; + start += h.cpt_next; + } + return 0; +} + +int rst_open_dumpfile(struct cpt_context *ctx) +{ + int err; + struct cpt_major_tail *v; + struct cpt_major_hdr h; + unsigned long size; + + err = -EBADF; + if (!ctx->file) + goto err_out; + + err = -ENOMEM; + ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); + if (ctx->tmpbuf == NULL) + goto err_out; + __cpt_release_buf(ctx); + + size = ctx->file->f_dentry->d_inode->i_size; 
+ + if (size & 7) { + err = -EINVAL; + goto err_out; + } + if (size < sizeof(struct cpt_major_hdr) + + sizeof(struct cpt_major_tail)) { + err = -EINVAL; + goto err_out; + } + err = ctx->pread(&h, sizeof(h), ctx, 0); + if (err) { + eprintk_ctx("too short image 1 %d\n", err); + goto err_out; + } + if (h.cpt_signature[0] != CPT_SIGNATURE0 || + h.cpt_signature[1] != CPT_SIGNATURE1 || + h.cpt_signature[2] != CPT_SIGNATURE2 || + h.cpt_signature[3] != CPT_SIGNATURE3) { + err = -EINVAL; + goto err_out; + } + if (h.cpt_hz != HZ) { + err = -EINVAL; + eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ); + goto err_out; + } + ctx->virt_jiffies64 = h.cpt_start_jiffies64; + ctx->start_time.tv_sec = h.cpt_start_sec; + ctx->start_time.tv_nsec = h.cpt_start_nsec; + ctx->kernel_config_flags = h.cpt_kernel_config[0]; + ctx->iptables_mask = h.cpt_iptables_mask; + if (h.cpt_image_version > CPT_CURRENT_VERSION || + CPT_VERSION_MINOR(h.cpt_image_version) > + CPT_VERSION_MINOR(CPT_CURRENT_VERSION)) { + eprintk_ctx("Unknown image version: %x. 
Can't restore.\n", + h.cpt_image_version); + err = -EINVAL; + goto err_out; + } + ctx->image_version = h.cpt_image_version; + ctx->features = (__u64)((__u64)h.cpt_ve_features2<<32 | h.cpt_ve_features); + ctx->image_arch = h.cpt_os_arch; + + v = cpt_get_buf(ctx); + err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v)); + if (err) { + eprintk_ctx("too short image 2 %d\n", err); + cpt_release_buf(ctx); + goto err_out; + } + if (v->cpt_signature[0] != CPT_SIGNATURE0 || + v->cpt_signature[1] != CPT_SIGNATURE1 || + v->cpt_signature[2] != CPT_SIGNATURE2 || + v->cpt_signature[3] != CPT_SIGNATURE3 || + v->cpt_nsect != CPT_SECT_MAX_INDEX) { + err = -EINVAL; + cpt_release_buf(ctx); + goto err_out; + } + if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) < 0) { + cpt_release_buf(ctx); + goto err_out; + } +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + ctx->lazypages = v->cpt_lazypages; +#endif + ctx->tasks64 = v->cpt_64bit; + cpt_release_buf(ctx); + return 0; + +err_out: + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } + return err; +} + +void rst_close_dumpfile(struct cpt_context *ctx) +{ + if (ctx->file) { + fput(ctx->file); + ctx->file = NULL; + } + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } +} + +int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx) +{ + int err; + struct cpt_object_hdr *hdr = tmp; + err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos); + if (err) + return err; + if (type > 0 && type != hdr->cpt_object) + return -EINVAL; + if (hdr->cpt_hdrlen > hdr->cpt_next) + return -EINVAL; + if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr)) + return -EINVAL; + if (size < sizeof(*hdr)) + return -EINVAL; + if (size > hdr->cpt_hdrlen) + size = hdr->cpt_hdrlen; + if (size > sizeof(*hdr)) + err = ctx->pread(hdr+1, size - sizeof(*hdr), + ctx, pos + sizeof(*hdr)); + return err; +} +EXPORT_SYMBOL(_rst_get_object); 
+ +void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx) +{ + int err; + void *tmp; + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(hdr), ctx, pos); + if (err) + return NULL; + if (type > 0 && type != hdr.cpt_object) + return NULL; + if (hdr.cpt_hdrlen > hdr.cpt_next) + return NULL; + if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr)) + return NULL; + tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL); + if (!tmp) + return NULL; + err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos); + if (!err) + return tmp; + kfree(tmp); + return NULL; +} +EXPORT_SYMBOL(__rst_get_object); + +__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx) +{ + int err; + struct cpt_object_hdr hdr; + __u8 *name; + + err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx); + if (err) + return NULL; + if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE) + return NULL; + name = (void*)__get_free_page(GFP_KERNEL); + if (!name) + return NULL; + err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen, + ctx, *pos_p + hdr.cpt_hdrlen); + if (err) { + free_page((unsigned long)name); + return NULL; + } + *pos_p += hdr.cpt_next; + return name; +} + +__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx) +{ + return __rst_get_name(&pos, ctx); +} + +void rst_put_name(__u8 *name, struct cpt_context *ctx) +{ + unsigned long addr = (unsigned long)name; + + if (addr) + free_page(addr&~(PAGE_SIZE-1)); +} + +struct rst_ops rst_ops = { + .get_object = _rst_get_object, + .rst_file = rst_file, +}; diff -urNp linux-2.6.32.48/kernel/cpt/rst_epoll.c linux-2.6.32.48-openvz/kernel/cpt/rst_epoll.c --- linux-2.6.32.48/kernel/cpt/rst_epoll.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_epoll.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,169 @@ +/* + * + * kernel/cpt/rst_epoll.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +/* Those funcations are static in fs/eventpoll.c */ +extern int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd); +extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); +extern void ep_release_epitem(struct epitem *epi); + + +struct file *cpt_open_epolldev(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx) +{ + struct file *file; + int efd; + + /* Argument "size" is ignored, use just 1 */ + efd = sys_epoll_create(1); + if (efd < 0) + return ERR_PTR(efd); + + file = fget(efd); + sys_close(efd); + return file; +} + +static int restore_one_epoll(cpt_object_t *obj, + loff_t pos, + struct cpt_epoll_image *ebuf, + cpt_context_t *ctx) +{ + int err = 0; + loff_t endpos; + struct file *file = obj->o_obj; + struct eventpoll *ep; + + if (file->f_op != &eventpoll_fops) { + eprintk_ctx("bad epoll file\n"); + return -EINVAL; + } + + ep = file->private_data; + + if (unlikely(ep == NULL)) { + eprintk_ctx("bad epoll device\n"); + return -EINVAL; + } + + endpos = pos + ebuf->cpt_next; + pos += ebuf->cpt_hdrlen; + while (pos < endpos) { + struct cpt_epoll_file_image efi; + struct epoll_event epds; + + cpt_object_t *tobj; + + err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx); + if (err) + return err; + tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx); + if (!tobj) { + eprintk_ctx("epoll file not found\n"); + return -EINVAL; + } + epds.events = efi.cpt_events; + epds.data = efi.cpt_data; + mutex_lock(&ep->mtx); + err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd); + if (!err) { + struct epitem *epi; + 
epi = ep_find(ep, tobj->o_obj, efi.cpt_fd);
+			if (epi) {
+				/* Queue the item on the ready list if the image marks it ready. */
+				if (efi.cpt_ready) {
+					unsigned long flags;
+					spin_lock_irqsave(&ep->lock, flags);
+					if (list_empty(&epi->rdllink))
+						list_add_tail(&epi->rdllink, &ep->rdllist);
+					spin_unlock_irqrestore(&ep->lock, flags);
+				}
+			}
+		}
+		mutex_unlock(&ep->mtx);
+		if (err)
+			break;
+		pos += efi.cpt_next;
+	}
+	return err;
+}
+
+/*
+ * Restore all epoll objects of section CPT_SECT_EPOLL.
+ * Each CPT_OBJ_EPOLL image is read into the context buffer, its epoll file
+ * is looked up by position among CPT_OBJ_FILE objects and the watched files
+ * are re-added by restore_one_epoll().  Returns 0 when the section is absent
+ * or fully restored, a negative errno otherwise.
+ */
+int rst_eventpoll(cpt_context_t *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_EPOLL];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		cpt_object_t *obj;
+		loff_t next;
+		struct cpt_epoll_image *ebuf = cpt_get_buf(ctx);
+		err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+		obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx);
+		if (obj == NULL) {
+			eprintk_ctx("cannot find epoll file object\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		err = restore_one_epoll(obj, sec, ebuf, ctx);
+		/* Read the record length while the buffer is still ours. */
+		next = ebuf->cpt_next;
+		cpt_release_buf(ctx);
+		if (err)
+			return err;
+		sec += next;
+	}
+
+	return 0;
+
+}
diff -urNp linux-2.6.32.48/kernel/cpt/rst_files.c linux-2.6.32.48-openvz/kernel/cpt/rst_files.c
--- linux-2.6.32.48/kernel/cpt/rst_files.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.32.48-openvz/kernel/cpt/rst_files.c 2011-11-21 17:40:47.000000000 -0500
@@ -0,0 +1,1799 @@
+/*
+ *
+ *  kernel/cpt/rst_files.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" + +#include "cpt_syscalls.h" + + +struct filejob { + struct filejob *next; + int pid; + loff_t fdi; +}; + +static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx) +{ + struct filejob *j; + + j = kmalloc(sizeof(*j), GFP_KERNEL); + if (j == NULL) + return -ENOMEM; + j->pid = current->pid; + j->fdi = pos; + j->next = ctx->filejob_queue; + ctx->filejob_queue = j; + return 0; +} + +static void _anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * If nobody else uses this page, and we don't already have a + * temporary page, let's keep track of it as a one-deep + * allocation cache. 
(Otherwise just release our reference to it) + */ + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); + + module_put(THIS_MODULE); +} + +static void *_anon_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) +{ + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); + } + + return kmap(buf->page); +} + +static void _anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) +{ + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); +} + +static int _anon_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + if (page_count(page) == 1) { + lock_page(page); + return 0; + } + + return 1; +} + +static void _anon_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + +static int _anon_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + return 0; +} + +static struct pipe_buf_operations _anon_pipe_buf_ops = { + .can_merge = 1, + .map = _anon_pipe_buf_map, + .unmap = _anon_pipe_buf_unmap, + .release = _anon_pipe_buf_release, + .confirm = _anon_pipe_buf_confirm, + .get = _anon_pipe_buf_get, + .steal = _anon_pipe_buf_steal, +}; + +/* Sorta ugly... Multiple readers/writers of named pipe rewrite buffer + * many times. We need to mark it in CPT_OBJ_INODE table in some way. 
+ */
+/*
+ * Refill the buffers of the pipe behind @file from the CPT_OBJ_BITS record
+ * that follows the inode image at fi->cpt_inode.  Returns 0 when there is
+ * nothing to restore, -EINVAL when @file is not a FIFO, when the pipe
+ * already holds data or when the saved data needs more than PIPE_BUFFERS
+ * pages, otherwise the first error met while filling the buffers.
+ */
+static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi,
+			   struct cpt_context *ctx)
+{
+	struct inode *ino = file->f_dentry->d_inode;
+	struct cpt_inode_image ii;
+	struct cpt_obj_bits b;
+	struct pipe_inode_info *info;
+	int err;
+	int count;
+
+	if (!S_ISFIFO(ino->i_mode)) {
+		eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", (long long)fi->cpt_inode);
+		return -EINVAL;
+	}
+	if (fi->cpt_inode == CPT_NULL)
+		return 0;
+
+	err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
+	if (err)
+		return err;
+
+	if (ii.cpt_next <= ii.cpt_hdrlen)
+		return 0;
+
+	err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx);
+	if (err)
+		return err;
+
+	if (b.cpt_size == 0)
+		return 0;
+
+	mutex_lock(&ino->i_mutex);
+	info = ino->i_pipe;
+	if (info->nrbufs) {
+		mutex_unlock(&ino->i_mutex);
+		eprintk("pipe buffer is restored already\n");
+		return -EINVAL;
+	}
+	info->curbuf = 0;
+	count = 0;
+	while (count < b.cpt_size) {
+		struct pipe_buffer *buf;
+		void * addr;
+		int chars;
+
+		/* The size comes from the image; never index past bufs[]. */
+		if (info->nrbufs >= PIPE_BUFFERS) {
+			err = -EINVAL;
+			break;
+		}
+		buf = info->bufs + info->nrbufs;
+
+		chars = b.cpt_size - count;
+		if (chars > PAGE_SIZE)
+			chars = PAGE_SIZE;
+		if (!try_module_get(THIS_MODULE)) {
+			err = -EBUSY;
+			break;
+		}
+
+		buf->page = alloc_page(GFP_HIGHUSER);
+		if (buf->page == NULL) {
+			/* The buffer never enters the pipe, so its release
+			 * op will not drop the module reference for us. */
+			module_put(THIS_MODULE);
+			err = -ENOMEM;
+			break;
+		}
+		buf->ops = &_anon_pipe_buf_ops;
+		buf->offset = 0;
+		buf->len = chars;
+		info->nrbufs++;
+		addr = kmap(buf->page);
+		err = ctx->pread(addr, chars, ctx,
+				 fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count);
+		/* Balance the kmap() above once the page has been filled. */
+		kunmap(buf->page);
+		if (err)
+			break;
+		count += chars;
+	}
+	mutex_unlock(&ino->i_mutex);
+
+	return err;
+}
+
+/*
+ * Build open(2) flags for re-opening the file described by @fi: the access
+ * mode is derived from cpt_mode, the saved cpt_flags are kept except for
+ * O_ACCMODE, O_CREAT, O_TRUNC, O_EXCL and FASYNC, and O_NOFOLLOW,
+ * O_NONBLOCK and O_NOCTTY are always set.
+ */
+static int make_flags(struct cpt_file_image *fi)
+{
+	int flags = O_NOFOLLOW;
+	switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) {
+	case FMODE_READ|FMODE_WRITE:
+		flags |= O_RDWR; break;
+	case FMODE_WRITE:
+		flags |= O_WRONLY; break;
+	case FMODE_READ:
+		flags |= O_RDONLY; break;
+	default: break;
+	}
+	flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC);
+	flags |= O_NONBLOCK|O_NOCTTY;
+	return flags;
+}
+
+/*
+ * Open one end of a pipe or FIFO described by @fi.
+ * For pipefs inodes a fresh pipe pair is created; for a named FIFO the
+ * reader is opened directly by @name, while a writer is opened through a
+ * temporary O_RDWR open of the same FIFO.  The inode and the other end are
+ * recorded in the object table.  Returns the requested file or an ERR_PTR().
+ */
+static struct file *open_pipe(char *name,
+			      struct cpt_file_image *fi,
+			      unsigned flags,
+			      struct cpt_context *ctx)
+{
+	int err;
+	cpt_object_t *obj;
+	struct cpt_inode_image ii;
+	struct file *rf, *wf;
+
+	err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
+	if (err)
+		return ERR_PTR(err);
+
+	if (ii.cpt_sb == FSMAGIC_PIPEFS) {
+		int pfd[2];
+
+		if ((err = sc_pipe(pfd)) < 0)
+			return ERR_PTR(err);
+
+		rf = fcheck(pfd[0]);
+		wf = fcheck(pfd[1]);
+		get_file(rf);
+		get_file(wf);
+		sc_close(pfd[0]);
+		sc_close(pfd[1]);
+
+		if (fi->cpt_mode&FMODE_READ) {
+			struct file *tf;
+			tf = wf; wf = rf; rf = tf;
+		}
+	} else {
+		if (fi->cpt_mode&FMODE_READ) {
+			rf = filp_open(name, flags, 0);
+			if (IS_ERR(rf)) {
+				dprintk_ctx("filp_open\n");
+				return rf;
+			}
+			dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current),
+				    (long long)fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode);
+			return rf;
+		}
+
+		dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), (long long)fi->cpt_inode);
+
+		rf = filp_open(name, O_RDWR|O_NONBLOCK, 0);
+		if (IS_ERR(rf))
+			return rf;
+		wf = dentry_open(dget(rf->f_dentry),
+				 mntget(rf->f_vfsmnt), flags, NULL);
+		/* dentry_open() reports failure as ERR_PTR(); do not
+		 * dereference it below, just drop the helper open. */
+		if (IS_ERR(wf)) {
+			fput(rf);
+			return wf;
+		}
+	}
+
+	/* Add pipe inode to obj table. */
+	obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx);
+	if (obj == NULL) {
+		fput(rf); fput(wf);
+		return ERR_PTR(-ENOMEM);
+	}
+	cpt_obj_setpos(obj, fi->cpt_inode, ctx);
+	obj->o_parent = rf;
+
+	/* Add another side of pipe to obj table, it will not be used
+	 * (o_pos = CPT_NULL), other processes opening the pipe will find
+	 * inode and open it with dentry_open(). */
+	obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx);
+	if (obj == NULL) {
+		fput(wf);
+		return ERR_PTR(-ENOMEM);
+	}
+	return wf;
+}
+
+static struct file *open_special(struct cpt_file_image *fi,
+				 unsigned flags,
+				 int deleted,
+				 struct cpt_context *ctx)
+{
+	struct cpt_inode_image *ii;
+	struct file *file;
+
+	/* Directories and named pipes are not special actually */
+	if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode))
+		return NULL;
+
+	/* No support for block devices at the moment. */
+	if (S_ISBLK(fi->cpt_i_mode))
+		return ERR_PTR(-EINVAL);
+
+	if (S_ISSOCK(fi->cpt_i_mode)) {
+		eprintk_ctx("bug: socket is not open\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Support only (some) character devices at the moment. */
+	if (!S_ISCHR(fi->cpt_i_mode))
+		return ERR_PTR(-EINVAL);
+
+	ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx);
+	if (ii == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Do not worry about this right now. /dev/null,zero,*random are here.
+	 * To prohibit at least /dev/mem?
+ */ + if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) { + kfree(ii); + return NULL; + } + + /* /dev/net/tun will be opened by caller */ + if (fi->cpt_lflags & CPT_DENTRY_TUNTAP) { + kfree(ii); + return NULL; + } + + file = rst_open_tty(fi, ii, flags, ctx); + kfree(ii); + return file; +} + +static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx) +{ + struct file_lock lock; + cpt_object_t *obj; + + memset(&lock, 0, sizeof(lock)); + lock.fl_type = fli->cpt_type; + lock.fl_flags = fli->cpt_flags & ~FL_SLEEP; + lock.fl_start = fli->cpt_start; + lock.fl_end = fli->cpt_end; + obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx); + if (!obj) { + eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner); + return -EINVAL; + } + lock.fl_owner = obj->o_obj; + lock.fl_pid = vpid_to_pid(fli->cpt_pid); + if (lock.fl_pid < 0) { + eprintk_ctx("unknown lock pid %d\n", lock.fl_pid); + return -EINVAL; + } + lock.fl_file = file; + + if (lock.fl_owner == NULL) + eprintk_ctx("no lock owner\n"); + return posix_lock_file(file, &lock, NULL); +} + +static int restore_flock(struct file *file, struct cpt_flock_image *fli, + cpt_context_t *ctx) +{ + int cmd, err, fd; + fd = get_unused_fd(); + if (fd < 0) { + eprintk_ctx("BSD flock cannot be restored\n"); + return fd; + } + get_file(file); + fd_install(fd, file); + if (fli->cpt_type == F_RDLCK) { + cmd = LOCK_SH; + } else if (fli->cpt_type == F_WRLCK) { + cmd = LOCK_EX; + } else { + eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type); + sc_close(fd); + return -EINVAL; + } + + err = sc_flock(fd, LOCK_NB | cmd); + sc_close(fd); + return err; +} + + +static int fixup_posix_locks(struct file *file, + struct cpt_file_image *fi, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end; + struct cpt_flock_image fli; + + end = pos + fi->cpt_next; + pos += fi->cpt_hdrlen; + while (pos < end) { + err = rst_get_object(-1, pos, &fli, ctx); + if (err) + return err; + if (fli.cpt_object == 
CPT_OBJ_FLOCK && + (fli.cpt_flags&FL_POSIX)) { + err = restore_posix_lock(file, &fli, ctx); + if (err) + return err; + dprintk_ctx("posix lock restored\n"); + } + pos += fli.cpt_next; + } + return 0; +} + +int rst_posix_locks(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + struct cpt_file_image fi; + + if (obj->o_pos == CPT_NULL) + continue; + + err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx); + if (err < 0) + return err; + if (fi.cpt_next > fi.cpt_hdrlen) + fixup_posix_locks(file, &fi, obj->o_pos, ctx); + } + return 0; +} + +static int fixup_flocks(struct file *file, + struct cpt_file_image *fi, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end; + struct cpt_flock_image fli; + + end = pos + fi->cpt_next; + pos += fi->cpt_hdrlen; + while (pos < end) { + err = rst_get_object(-1, pos, &fli, ctx); + if (err) + return err; + if (fli.cpt_object == CPT_OBJ_FLOCK && + (fli.cpt_flags&FL_FLOCK)) { + err = restore_flock(file, &fli, ctx); + if (err) + return err; + dprintk_ctx("bsd lock restored\n"); + } + pos += fli.cpt_next; + } + return 0; +} + + +static int fixup_reg_data(struct file *file, loff_t pos, loff_t end, + struct cpt_context *ctx) +{ + int err; + struct cpt_page_block pgb; + ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); + + do_write = file->f_op->write; + if (do_write == NULL) { + eprintk_ctx("no write method. 
Cannot restore contents of the file.\n"); + return -EINVAL; + } + + atomic_long_inc(&file->f_count); + + while (pos < end) { + loff_t opos; + loff_t ipos; + int count; + + err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); + if (err) + goto out; + dprintk_ctx("restoring file data block: %08x-%08x\n", + (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); + ipos = pos + pgb.cpt_hdrlen; + opos = pgb.cpt_start; + count = pgb.cpt_end-pgb.cpt_start; + while (count > 0) { + mm_segment_t oldfs; + int copy = count; + + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + oldfs = get_fs(); set_fs(KERNEL_DS); + err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); + set_fs(oldfs); + if (err) { + __cpt_release_buf(ctx); + goto out; + } + if (!(file->f_mode & FMODE_WRITE) || + (file->f_flags&O_DIRECT)) { + fput(file); + file = dentry_open(dget(file->f_dentry), + mntget(file->f_vfsmnt), + O_WRONLY | O_LARGEFILE, NULL); + if (IS_ERR(file)) { + __cpt_release_buf(ctx); + return PTR_ERR(file); + } + } + oldfs = get_fs(); set_fs(KERNEL_DS); + ipos += copy; + err = do_write(file, ctx->tmpbuf, copy, &opos); + set_fs(oldfs); + __cpt_release_buf(ctx); + if (err != copy) { + if (err >= 0) + err = -EIO; + goto out; + } + count -= copy; + } + pos += pgb.cpt_next; + } + err = 0; + +out: + fput(file); + return err; +} + + +static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi, + struct cpt_inode_image *ii, + struct cpt_context *ctx) +{ + int err; + struct file *file = *file_p; + struct iattr newattrs; + + if (!S_ISREG(fi->cpt_i_mode)) + return 0; + + if (file == NULL) { + file = shmem_file_setup("dev/zero", ii->cpt_size, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + *file_p = file; + } + + if (ii->cpt_next > ii->cpt_hdrlen) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ctx, fi->cpt_inode+ii->cpt_hdrlen); + if (err) + return err; + if (hdr.cpt_object == CPT_OBJ_PAGES) { + err = fixup_reg_data(file, 
fi->cpt_inode+ii->cpt_hdrlen, + fi->cpt_inode+ii->cpt_next, ctx); + if (err) + return err; + } + } + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + /* stage 1 - update size like do_truncate does */ + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + newattrs.ia_size = ii->cpt_size; + cpt_timespec_import(&newattrs.ia_ctime, ii->cpt_ctime); + err = notify_change(file->f_dentry, &newattrs); + if (err) + goto out; + + /* stage 2 - update times, owner and mode */ + newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_ATIME_SET | ATTR_MTIME_SET | + ATTR_MODE | ATTR_UID | ATTR_GID; + newattrs.ia_uid = ii->cpt_uid; + newattrs.ia_gid = ii->cpt_gid; + newattrs.ia_mode = file->f_dentry->d_inode->i_mode & S_IFMT; + newattrs.ia_mode |= (ii->cpt_mode & ~S_IFMT); + cpt_timespec_import(&newattrs.ia_atime, ii->cpt_atime); + cpt_timespec_import(&newattrs.ia_mtime, ii->cpt_mtime); + err = notify_change(file->f_dentry, &newattrs); + +out: + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return err; +} + +static int fixup_file_flags(struct file *file, const struct cred *cred, + struct cpt_file_image *fi, + int was_dentry_open, loff_t pos, + cpt_context_t *ctx) +{ + if (fi->cpt_pos != file->f_pos) { + int err = -ESPIPE; + if (file->f_op->llseek) + err = file->f_op->llseek(file, fi->cpt_pos, 0); + if (err < 0) { + dprintk_ctx("file %Ld lseek %Ld - %Ld\n", + (long long)pos, + (long long)file->f_pos, + (long long)fi->cpt_pos); + file->f_pos = fi->cpt_pos; + } + } + + if (cred->uid != fi->cpt_uid || cred->gid != fi->cpt_gid) + wprintk_ctx("fixup_file_flags: oops... creds mismatch\n"); + + /* + * this is wrong. 
but with current cpt_file_image there's + * nothing we can do + */ + + put_cred(file->f_cred); + file->f_cred = get_cred(cred); + + file->f_owner.pid = 0; + if (fi->cpt_fown_pid != CPT_FOWN_STRAY_PID) { + file->f_owner.pid = find_get_pid(fi->cpt_fown_pid); + if (file->f_owner.pid == NULL) { + wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n", + fi->cpt_fown_pid); + return -EINVAL; + } + } + file->f_owner.uid = fi->cpt_fown_uid; + file->f_owner.euid = fi->cpt_fown_euid; + file->f_owner.signum = fi->cpt_fown_signo; + + if (file->f_mode != fi->cpt_mode) { + if (was_dentry_open && + ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) { + file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK); + file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK); + } + if (file->f_mode != fi->cpt_mode) + wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode); + } + if (file->f_flags != fi->cpt_flags) { + if (!(fi->cpt_flags&O_NOFOLLOW)) + file->f_flags &= ~O_NOFOLLOW; + if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) { + file->f_flags &= ~O_NONBLOCK; + file->f_flags |= fi->cpt_flags&O_NONBLOCK; + } + if (fi->cpt_flags&FASYNC) { + if (fi->cpt_fown_fd == -1) { + wprintk_ctx("No fd for FASYNC\n"); + return -EINVAL; + } else if (file->f_op && file->f_op->fasync) { + if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) { + wprintk_ctx("FASYNC problem\n"); + return -EINVAL; + } else { + file->f_flags |= FASYNC; + } + } + } + if (file->f_flags != fi->cpt_flags) { + eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags); + return -EINVAL; + } + } + return 0; +} + +static struct file * +open_deleted(char *name, unsigned flags, struct cpt_file_image *fi, + struct cpt_inode_image *ii, cpt_context_t *ctx) +{ + struct file * file; + char *suffix = NULL; + int attempt = 0; + int tmp_pass = 0; + mode_t mode = fi->cpt_i_mode; + + /* Strip (deleted) part... 
*/ + if (strlen(name) > strlen(" (deleted)")) { + if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) { + suffix = &name[strlen(name) - strlen(" (deleted)")]; + *suffix = 0; + } else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) { + memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1); + suffix = name + strlen(name); + } + } + +try_again: + for (;;) { + if (attempt) { + if (attempt > 1000) { + eprintk_ctx("open_deleted: failed after %d attempts\n", attempt); + return ERR_PTR(-EEXIST); + } + if (suffix == NULL) { + eprintk_ctx("open_deleted: no suffix\n"); + return ERR_PTR(-EEXIST); + } + sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt)); + } + attempt++; + + if (S_ISFIFO(mode)) { + int err; + err = sc_mknod(name, S_IFIFO|(mode&017777), 0); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = open_pipe(name, fi, flags, ctx); + sc_unlink(name); + } else if (S_ISCHR(mode)) { + int err; + err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev)); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = filp_open(name, flags, mode&017777); + sc_unlink(name); + } else if (S_ISDIR(mode)) { + int err; + err = sc_mkdir(name, mode&017777); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = filp_open(name, flags, mode&017777); + sc_rmdir(name); + } else { + file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777); + if (IS_ERR(file)) { + if (PTR_ERR(file) == -EEXIST) + continue; + if (!tmp_pass) + goto change_dir; + } else { + sc_unlink(name); + } + } + break; + } + + if (IS_ERR(file)) { + eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file)); + return file; + } else { + dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, 
file->f_dentry->d_inode->i_mode); + } + return file; + +change_dir: + sprintf(name, "/tmp/rst%u", current->pid); + suffix = name + strlen(name); + attempt = 1; + tmp_pass = 1; + goto try_again; +} + +#ifdef CONFIG_SIGNALFD +static struct file *open_signalfd(struct cpt_file_image *fi, int flags, struct cpt_context *ctx) +{ + sigset_t mask; + mm_segment_t old_fs; + int fd; + struct file *file; + + cpt_sigset_import(&mask, fi->cpt_priv); + + old_fs = get_fs(); set_fs(KERNEL_DS); + fd = do_signalfd(-1, &mask, flags & (O_CLOEXEC | O_NONBLOCK)); + set_fs(old_fs); + + if (fd < 0) + return ERR_PTR(fd); + + file = fget(fd); + sys_close(fd); + + return file; +} +#else +static struct file *open_signalfd(struct cpt_file_image *fi, int flags, struct cpt_context *ctx) +{ + return ERR_PTR(-EINVAL); +} +#endif + +struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx) +{ + int err; + int was_dentry_open = 0; + cpt_object_t *obj; + cpt_object_t *iobj; + struct cpt_file_image fi; + __u8 *name = NULL; + struct file *file; + struct proc_dir_entry *proc_dead_file; + int flags; + const struct cred *cred_origin; + + /* + * It may happen that a process which created a file + * had changed its UID after that (keeping file opened/referenced + * with write permissions for 'own' only) as a result we might + * be unable to read it at restore time due to credentials + * mismatch, to break this tie we temporary take init_cred credentials + * and as only the file gets read into the memory we restore original + * credentials back + * + * Same time if between credentials rise/restore you need + * the former credentials (for fixups or whatever) -- + * use cred_origin for that + */ + + cred_origin = override_creds(&init_cred); + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx); + if (obj) { + file = obj->o_obj; + if (obj->o_index >= 0) { + dprintk_ctx("file is attached to a socket\n"); + err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); + if (err < 0) + goto err_out; + 
fixup_file_flags(file, cred_origin, &fi, 0, pos, ctx); + } + get_file(file); + revert_creds(cred_origin); + return file; + } + + err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); + if (err < 0) + goto err_out; + + flags = make_flags(&fi); + + /* Easy way, inode has been already open. */ + if (fi.cpt_inode != CPT_NULL && + !(fi.cpt_lflags & CPT_DENTRY_CLONING) && + (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL && + iobj->o_parent) { + struct file *filp = iobj->o_parent; + file = dentry_open(dget(filp->f_dentry), + mntget(filp->f_vfsmnt), flags, NULL); + dprintk_ctx("rst_file: file obtained by dentry_open\n"); + was_dentry_open = 1; + goto map_file; + } + + if (fi.cpt_next > fi.cpt_hdrlen) + name = rst_get_name(pos + sizeof(fi), ctx); + + if (!name) { + eprintk_ctx("no name for file?\n"); + err = -EINVAL; + goto err_out; + } + + if (fi.cpt_lflags & CPT_DENTRY_DELETED) { + struct cpt_inode_image ii; + if (fi.cpt_inode == CPT_NULL) { + eprintk_ctx("deleted file and no inode.\n"); + err = -EINVAL; + goto err_out; + } + + err = rst_get_object(CPT_OBJ_INODE, fi.cpt_inode, &ii, ctx); + if (err) + goto err_out; + + if (ii.cpt_next > ii.cpt_hdrlen) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(hdr), ctx, + fi.cpt_inode + ii.cpt_hdrlen); + if (err) + goto err_out; + if (hdr.cpt_object == CPT_OBJ_NAME) { + rst_put_name(name, ctx); + name = rst_get_name(fi.cpt_inode+ii.cpt_hdrlen, + ctx); + if (!name) { + eprintk_ctx("no name for link?\n"); + err = -EINVAL; + goto err_out; + } + if ((fi.cpt_lflags & CPT_DENTRY_HARDLINKED) && + !ctx->hardlinked_on) { + eprintk_ctx("Open hardlinked is off\n"); + err = -EPERM; + goto err_out; + } + goto open_file; + } + } + + /* One very special case... */ + if (S_ISREG(fi.cpt_i_mode) && + (!name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) { + /* MAP_ANON|MAP_SHARED mapping. 
+ * kernel makes this damn ugly way, when file which + * is passed to mmap by user does not match + * file finally attached to VMA. Ok, rst_mm + * has to take care of this. Otherwise, it will fail. + */ + file = NULL; + } else if (S_ISREG(fi.cpt_i_mode) || + S_ISCHR(fi.cpt_i_mode) || + S_ISFIFO(fi.cpt_i_mode) || + S_ISDIR(fi.cpt_i_mode)) { + if (S_ISCHR(fi.cpt_i_mode)) { + file = open_special(&fi, flags, 1, ctx); + if (file != NULL) + goto map_file; + } + file = open_deleted(name, flags, &fi, &ii, ctx); + if (IS_ERR(file)) + goto out; + } else { + eprintk_ctx("not a regular deleted file.\n"); + err = -EINVAL; + goto err_out; + } + + err = fixup_file_content(&file, &fi, &ii, ctx); + if (err) + goto err_put; + goto map_file; + } else { +open_file: + if (!name[0]) { + eprintk_ctx("empty name for file?\n"); + err = -EINVAL; + goto err_out; + } + if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) && + (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL) + goto map_file; +#ifdef CONFIG_INOTIFY_USER + if ((fi.cpt_lflags & CPT_DENTRY_INOTIFY) && + (file = rst_open_inotify(&fi, flags, ctx)) != NULL) + goto map_file; +#else + if (fi.cpt_lflags & CPT_DENTRY_INOTIFY) { + err = -EINVAL; + goto err_out; + } +#endif + if ((fi.cpt_lflags & CPT_DENTRY_SIGNALFD) && + (file = open_signalfd(&fi, flags, ctx)) != NULL) + goto map_file; + if (S_ISFIFO(fi.cpt_i_mode) && + (file = open_pipe(name, &fi, flags, ctx)) != NULL) + goto map_file; + if (!S_ISREG(fi.cpt_i_mode) && + (file = open_special(&fi, flags, 0, ctx)) != NULL) + goto map_file; + } + + /* This hook is needed to open file /proc/<pid>/<file> + * but there is no process with pid <pid>. 
+ */ + proc_dead_file = NULL; + if (fi.cpt_lflags & CPT_DENTRY_PROCPID_DEAD) { + sprintf(name, "/proc/rst_dead_pid_file_%d", task_pid_vnr(current)); + + proc_dead_file = create_proc_entry(name + 6, S_IRUGO|S_IWUGO, + NULL); + if (!proc_dead_file) { + eprintk_ctx("can't create proc entry %s\n", name); + err = -ENOMEM; + goto err_out; + } +#ifdef CONFIG_PROC_FS + proc_dead_file->proc_fops = &dummy_proc_pid_file_operations; +#endif + } + + file = filp_open(name, flags, 0); + + if (proc_dead_file) { + remove_proc_entry(proc_dead_file->name, NULL); + if (!IS_ERR(file)) + d_drop(file->f_dentry); + } +map_file: + if (!IS_ERR(file)) { + fixup_file_flags(file, cred_origin, &fi, was_dentry_open, pos, ctx); + + if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) { + err = fixup_pipe_data(file, &fi, ctx); + if (err) + goto err_put; + } + + /* This is very special hack. Logically, cwd/root are + * nothing but open directories. Nevertheless, this causes + * failures of restores, when number of open files in VE + * is close to limit. So, if it is rst_file() of cwd/root + * (fd = -2) and the directory is not deleted, we skip + * adding files to object table. If the directory is + * not unlinked, this cannot cause any problems. 
+ */ + if (fd != -2 || + !S_ISDIR(file->f_dentry->d_inode->i_mode) || + (fi.cpt_lflags & CPT_DENTRY_DELETED)) { + obj = cpt_object_get(CPT_OBJ_FILE, file, ctx); + if (!obj) { + obj = cpt_object_add(CPT_OBJ_FILE, file, ctx); + if (obj) + get_file(file); + } + if (obj) + cpt_obj_setpos(obj, pos, ctx); + + obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (obj) { + cpt_obj_setpos(obj, fi.cpt_inode, ctx); + if (!obj->o_parent || !(fi.cpt_lflags & CPT_DENTRY_DELETED)) + obj->o_parent = file; + } + } + + if (fi.cpt_next > fi.cpt_hdrlen) { + err = fixup_flocks(file, &fi, pos, ctx); + if (err) + goto err_put; + } + } else { + if ((fi.cpt_lflags & CPT_DENTRY_PROC) && + !(fi.cpt_lflags & CPT_DENTRY_PROCPID_DEAD)) { + dprintk_ctx("rst_file /proc delayed\n"); + file = NULL; + } else if (name) + eprintk_ctx("can't open file %s\n", name); + } + +out: + if (name) + rst_put_name(name, ctx); + revert_creds(cred_origin); + return file; + +err_put: + if (file) + fput(file); +err_out: + if (name) + rst_put_name(name, ctx); + revert_creds(cred_origin); + return ERR_PTR(err); +} + + +__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + + if (ti->cpt_files == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx)) + flag |= CLONE_FILES; + if (ti->cpt_fs == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx)) + flag |= CLONE_FS; + return flag; +} + +static void local_close_files(struct files_struct * files) +{ + int i, j; + + j = 0; + for (;;) { + unsigned long set; + i = j * __NFDBITS; + if (i >= files->fdt->max_fds) + break; + set = files->fdt->open_fds->fds_bits[j]; + while (set) { + if (set & 1) { + struct file * file = xchg(&files->fdt->fd[i], NULL); + if (file) + filp_close(file, files); + } + i++; + set >>= 1; + } + files->fdt->open_fds->fds_bits[j] = 0; + files->fdt->close_on_exec->fds_bits[j] = 0; + j++; + } +} + +extern int expand_fdtable(struct files_struct *files, int nr); + + +static 
int rst_files(struct cpt_task_image *ti, struct cpt_context *ctx, + int from, int to) +{ + struct cpt_files_struct_image fi; + struct files_struct *f = current->files; + cpt_object_t *obj; + loff_t pos, endpos; + int err; + + if (ti->cpt_files == CPT_NULL) { + current->files = NULL; + if (f) + put_files_struct(f); + return 0; + } + + if (from == 3) { + err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx); + if (err) + return err; + + goto just_do_it; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx); + if (obj) { + if (obj->o_obj != f) { + put_files_struct(f); + f = obj->o_obj; + atomic_inc(&f->count); + current->files = f; + } + return 0; + } + + err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx); + if (err) + return err; + + local_close_files(f); + + if (fi.cpt_max_fds > f->fdt->max_fds) { + spin_lock(&f->file_lock); + err = expand_fdtable(f, fi.cpt_max_fds-1); + spin_unlock(&f->file_lock); + if (err < 0) + return err; + } + +just_do_it: + pos = ti->cpt_files + fi.cpt_hdrlen; + endpos = ti->cpt_files + fi.cpt_next; + while (pos < endpos) { + struct cpt_fd_image fdi; + struct file *filp; + + err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx); + if (err) + return err; + if (fdi.cpt_fd < from || fdi.cpt_fd > to) + goto skip; + + filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); + if (IS_ERR(filp)) { + eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp), + (long long)fdi.cpt_file); + return PTR_ERR(filp); + } + if (filp == NULL) { + int err = rst_filejob_queue(pos, ctx); + if (err) + return err; + } else { + if (fdi.cpt_fd >= f->fdt->max_fds) BUG(); + f->fdt->fd[fdi.cpt_fd] = filp; + FD_SET(fdi.cpt_fd, f->fdt->open_fds); + if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) + FD_SET(fdi.cpt_fd, f->fdt->close_on_exec); + } + +skip: + pos += fdi.cpt_next; + } + f->next_fd = fi.cpt_next_fd; + + obj = cpt_object_add(CPT_OBJ_FILES, f, ctx); + if (obj) { + cpt_obj_setpos(obj, ti->cpt_files, ctx); + cpt_obj_setindex(obj, fi.cpt_index, ctx); + } + 
return 0; +} + +int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + return rst_files(ti, ctx, (ti->cpt_pid == 1) ? 3 : 0, INT_MAX); +} + +int rst_files_std(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + return rst_files(ti, ctx, 0, 2); +} + +int rst_do_filejobs(cpt_context_t *ctx) +{ + struct filejob *j; + + while ((j = ctx->filejob_queue) != NULL) { + int err; + struct task_struct *tsk; + struct cpt_fd_image fdi; + struct file *filp; + + read_lock(&tasklist_lock); + tsk = find_task_by_vpid(j->pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (!tsk) + return -EINVAL; + + err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx); + if (err) { + put_task_struct(tsk); + return err; + } + + if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); + if (tsk->files->fdt->fd[fdi.cpt_fd] || + FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) { + eprintk_ctx("doing filejob %Ld: fd is busy\n", j->fdi); + put_task_struct(tsk); + return -EBUSY; + } + + filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); + if (IS_ERR(filp)) { + eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), (unsigned long long)fdi.cpt_file); + put_task_struct(tsk); + return PTR_ERR(filp); + } + if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); + tsk->files->fdt->fd[fdi.cpt_fd] = filp; + FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds); + if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) + FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec); + + dprintk_ctx("filejob %Ld done\n", j->fdi); + + put_task_struct(tsk); + ctx->filejob_queue = j->next; + kfree(j); + } + return 0; +} + +void rst_flush_filejobs(cpt_context_t *ctx) +{ + struct filejob *j; + + while ((j = ctx->filejob_queue) != NULL) { + ctx->filejob_queue = j->next; + kfree(j); + } +} + +int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct fs_struct *f = current->fs; + cpt_object_t *obj; + + if (ti->cpt_fs == CPT_NULL) { + exit_fs(current); + return 0; + } + + obj = 
lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx); + if (obj) { + if (obj->o_obj != f) { + exit_fs(current); + f = obj->o_obj; + write_lock(&f->lock); + f->users++; + write_unlock(&f->lock); + current->fs = f; + } + return 0; + } + + /* Do _not_ restore root. Image contains absolute pathnames. + * So, we fix it in context of rst process. + */ + + obj = cpt_object_add(CPT_OBJ_FS, f, ctx); + if (obj) + cpt_obj_setpos(obj, ti->cpt_fs, ctx); + + return 0; +} + +int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp, + loff_t *pos, struct cpt_context *ctx) +{ + struct cpt_file_image fi; + struct file * file; + int err; + + err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx); + if (err) + return err; + + file = rst_file(*pos, -2, ctx); + if (IS_ERR(file)) { + if (PTR_ERR(file) == -EINVAL && S_ISLNK(fi.cpt_i_mode)) { + /* One special case: inotify on symlink */ + struct nameidata nd; + __u8 *name = NULL; + + if (fi.cpt_next > fi.cpt_hdrlen) + name = rst_get_name(*pos + sizeof(fi), ctx); + if (!name) { + eprintk_ctx("can't get name for file\n"); + return -EINVAL; + } + if ((err = path_lookup(name, 0, &nd)) != 0) { + eprintk_ctx("path_lookup %s: %d\n", name, err); + rst_put_name(name, ctx); + return -EINVAL; + } + *dp = nd.path.dentry; + *mp = nd.path.mnt; + *pos += fi.cpt_next; + rst_put_name(name, ctx); + return 0; + } + return PTR_ERR(file); + } + + *dp = dget(file->f_dentry); + *mp = mntget(file->f_vfsmnt); + *pos += fi.cpt_next; + fput(file); + return 0; +} + +static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct dentry *old_root; + struct vfsmount *old_rootmnt; + write_lock(&fs->lock); + old_root = fs->root.dentry; + old_rootmnt = fs->root.mnt; + fs->root.mnt = mnt; + fs->root.dentry = dentry; + write_unlock(&fs->lock); + if (old_root) { + dput(old_root); + mntput(old_rootmnt); + } +} + +static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct dentry *old_pwd; + 
struct vfsmount *old_pwdmnt; + + write_lock(&fs->lock); + old_pwd = fs->pwd.dentry; + old_pwdmnt = fs->pwd.mnt; + fs->pwd.mnt = mnt; + fs->pwd.dentry = dentry; + write_unlock(&fs->lock); + + if (old_pwd) { + dput(old_pwd); + mntput(old_pwdmnt); + } +} + + +int rst_restore_fs(struct cpt_context *ctx) +{ + loff_t pos; + cpt_object_t *obj; + int err = 0; + + for_each_object(obj, CPT_OBJ_FS) { + struct cpt_fs_struct_image fi; + struct fs_struct *fs = obj->o_obj; + int i; + struct dentry *d[3]; + struct vfsmount *m[3]; + + err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx); + if (err) + return err; + + fs->umask = fi.cpt_umask; + + pos = obj->o_pos + fi.cpt_hdrlen; + d[0] = d[1] = d[2] = NULL; + m[0] = m[1] = m[2] = NULL; + i = 0; + while (pos < obj->o_pos + fi.cpt_next && i<3) { + err = cpt_get_dentry(d+i, m+i, &pos, ctx); + if (err) { + eprintk_ctx("cannot get_dir: %d", err); + for (--i; i >= 0; i--) { + if (d[i]) + dput(d[i]); + if (m[i]) + mntput(m[i]); + } + return err; + } + i++; + } + if (d[0]) + __set_fs_root(fs, m[0], d[0]); + if (d[1]) + __set_fs_pwd(fs, m[1], d[1]); + if (d[2]) + wprintk_ctx("altroot arrived...\n"); + } + return err; +} + +int do_one_mount(char *mntpnt, char *mnttype, char *mntbind, + unsigned long flags, unsigned long mnt_flags, + struct cpt_context *ctx) +{ + int err; + + if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0)) + mntbind = NULL; + + if (mntbind) + flags |= MS_BIND; + /* Join per-mountpoint flags with global flags */ + if (mnt_flags & MNT_NOSUID) + flags |= MS_NOSUID; + if (mnt_flags & MNT_NODEV) + flags |= MS_NODEV; + if (mnt_flags & MNT_NOEXEC) + flags |= MS_NOEXEC; + + err = sc_mount(mntbind, mntpnt, mnttype, flags); + if (err < 0) { + eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags); + return err; + } + return 0; +} + +static int undumptmpfs(void *arg) +{ + int i; + int *pfd = arg; + int fd1, fd2, err; + char *argv[] = { "tar", "x", "-C", "/", "-S", NULL }; + + if (pfd[0] != 0) + 
sc_dup2(pfd[0], 0); + + set_fs(KERNEL_DS); + fd1 = sc_open("/dev/null", O_WRONLY, 0); + fd2 = sc_open("/dev/null", O_WRONLY, 0); +try: + if (fd1 < 0 || fd2 < 0) { + if (fd1 == -ENOENT && fd2 == -ENOENT) { + err = sc_mknod("/dev/null", S_IFCHR|0666, + new_encode_dev((MEM_MAJOR<files->fdt->max_fds; i++) + sc_close(i); + + module_put(THIS_MODULE); + + i = sc_execve("/bin/tar", argv, NULL); + eprintk("failed to exec /bin/tar: %d\n", i); + return 255 << 8; +} + +static int rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx) +{ + int err; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + int n; + loff_t end; + int pid; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + + err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx); + if (err < 0) + return err; + + err = sc_pipe(pfd); + if (err < 0) + return err; + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0); + if (err < 0) { + eprintk_ctx("tmpfs local_kernel_thread: %d\n", err); + goto out; + } + f = fget(pfd[1]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + ctx->file->f_pos = *pos + v.cpt_hdrlen; + end = *pos + v.cpt_next; + *pos += v.cpt_next; + do { + char buf[16]; + + n = end - ctx->file->f_pos; + if (n > sizeof(buf)) + n = sizeof(buf); + + if (ctx->read(buf, n, ctx)) + break; + oldfs = get_fs(); set_fs(KERNEL_DS); + f->f_op->write(f, buf, n, &f->f_pos); + set_fs(oldfs); + } while (ctx->file->f_pos < end); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("tar exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("tar terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + return err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) 
+ sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); + return err; +} + +int check_ext_mount(char *mntpnt, char *mnttype, struct cpt_context *ctx) +{ + struct mnt_namespace *n; + struct list_head *p; + struct vfsmount *t; + char *path, *path_buf; + int ret; + + n = current->nsproxy->mnt_ns; + ret = -ENOENT; + path_buf = cpt_get_buf(ctx); + down_read(&namespace_sem); + list_for_each(p, &n->list) { + struct path pt; + t = list_entry(p, struct vfsmount, mnt_list); + pt.dentry = t->mnt_root; + pt.mnt = t; + path = d_path(&pt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + if (!strcmp(path, mntpnt) && + !strcmp(t->mnt_sb->s_type->name, mnttype)) { + ret = 0; + break; + } + } + up_read(&namespace_sem); + __cpt_release_buf(ctx); + return ret; +} + +int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t endpos; + + endpos = pos + mi->cpt_next; + pos += mi->cpt_hdrlen; + + while (pos < endpos) { + char *mntdev; + char *mntpnt; + char *mnttype; + char *mntbind; + + mntdev = __rst_get_name(&pos, ctx); + mntpnt = __rst_get_name(&pos, ctx); + mnttype = __rst_get_name(&pos, ctx); + mntbind = NULL; + if (mi->cpt_mntflags & CPT_MNT_BIND) + mntbind = __rst_get_name(&pos, ctx); + err = -EINVAL; + if (mnttype && mntpnt) { + err = 0; + if (!(mi->cpt_mntflags & CPT_MNT_EXT) && + strcmp(mntpnt, "/")) { + err = do_one_mount(mntpnt, mnttype, mntbind, + mi->cpt_flags, + mi->cpt_mntflags, ctx); + if (!err && + strcmp(mnttype, "tmpfs") == 0 && + !(mi->cpt_mntflags & (CPT_MNT_BIND))) + err = rst_restore_tmpfs(&pos, ctx); + } else if (mi->cpt_mntflags & CPT_MNT_EXT) { + err = check_ext_mount(mntpnt, mnttype, ctx); + if (err) + eprintk_ctx("mount point is missing: %s\n", mntpnt); + } + } + if (mntdev) + rst_put_name(mntdev, ctx); + if (mntpnt) + rst_put_name(mntpnt, ctx); + if (mnttype) + rst_put_name(mnttype, ctx); + if (mntbind) + rst_put_name(mntbind, ctx); + if (err) + return err; + } + return 0; +} + +int 
restore_one_namespace(loff_t pos, loff_t endpos, struct cpt_context *ctx) +{ + int err; + struct cpt_vfsmount_image mi; + + while (pos < endpos) { + err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx); + if (err) + return err; + err = restore_one_vfsmount(&mi, pos, ctx); + if (err) + return err; + pos += mi.cpt_next; + } + return 0; +} + +int rst_root_namespace(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_NAMESPACE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr sbuf; + int done = 0; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx); + if (err) + return err; + if (done) { + eprintk_ctx("multiple namespaces are not supported\n"); + break; + } + done++; + err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + + return 0; +} + +int rst_stray_files(struct cpt_context *ctx) +{ + int err = 0; + loff_t sec = ctx->sections[CPT_SECT_FILES]; + loff_t endsec; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_object_hdr sbuf; + cpt_object_t *obj; + + err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx); + if (err) + break; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx); + if (!obj) { + struct file *file; + + dprintk_ctx("stray file %Ld\n", sec); + + file = rst_sysv_shm_itself(sec, ctx); + + if (IS_ERR(file)) { + eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file)); + return 
PTR_ERR(file); + } else { + fput(file); + } + } + sec += sbuf.cpt_next; + } + + return err; +} diff -urNp linux-2.6.32.48/kernel/cpt/rst_inotify.c linux-2.6.32.48-openvz/kernel/cpt/rst_inotify.c --- linux-2.6.32.48/kernel/cpt/rst_inotify.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_inotify.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,188 @@ +/* + * + * kernel/cpt/rst_inotify.c + * + * Copyright (C) 2000-2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +struct file *rst_open_inotify(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx) +{ + struct file *file; + int fd; + + fd = sys_inotify_init(); + if (fd < 0) + return ERR_PTR(fd); + + file = fget(fd); + sys_close(fd); + return file; +} + +static int restore_one_inotify(cpt_object_t *obj, + loff_t pos, + struct cpt_inotify_image *ibuf, + cpt_context_t *ctx) +{ + int err = 0; + loff_t endpos; + struct file *file = obj->o_obj; + struct fsnotify_group *group; + + if (file->f_op != &inotify_fops) { + eprintk_ctx("bad inotify file\n"); + return -EINVAL; + } + + group = file->private_data; + + if (unlikely(group == NULL)) { + eprintk_ctx("bad inotify device\n"); + return -EINVAL; + } + + endpos = pos + ibuf->cpt_next; + pos += ibuf->cpt_hdrlen; + while (pos < endpos) { + union { + struct cpt_inotify_wd_image wi; + struct cpt_inotify_ev_image ei; + } u; + + err = rst_get_object(-1, pos, &u, ctx); + if (err) { + eprintk_ctx("rst_get_object: %d\n", err); + return err; + } + if (u.wi.cpt_object == CPT_OBJ_INOTIFY_WATCH) { + struct path p; + 
loff_t fpos = pos + u.wi.cpt_hdrlen; + + err = cpt_get_dentry(&p.dentry, &p.mnt, &fpos, ctx); + if (err) { + eprintk_ctx("cpt_get_dentry: %d\n", err); + return err; + } + + err = __inotify_new_watch(group, &p, u.wi.cpt_mask, u.wi.cpt_wd); + path_put(&p); /* release the path filled in by cpt_get_dentry() */ + if (err < 0) + break; + + err = 0; /* for proper return value: a non-negative result from __inotify_new_watch() must not leak out as an error */ + } else if (u.wi.cpt_object == CPT_OBJ_INOTIFY_EVENT) { +#if 0 + struct inotify_user_watch dummy_watch; + struct inotify_watch *w; + char *name = NULL; + + if (u.ei.cpt_namelen) { + name = kmalloc(u.ei.cpt_namelen+1, GFP_KERNEL); + if (name == NULL) { + err = -ENOMEM; + break; + } + name[u.ei.cpt_namelen] = 0; + err = ctx->pread(name, u.ei.cpt_namelen, ctx, pos + u.ei.cpt_hdrlen); + if (err) { + kfree(name); + break; + } + } + + w = &dummy_watch.wdata; + dummy_watch.dev = dev; + atomic_set(&w->count, 2); + + /* Trick to avoid destruction due to exit event */ + if (u.ei.cpt_mask & (IN_IGNORED | IN_ONESHOT)) + atomic_inc(&w->count); + dev->ih->in_ops->handle_event(w, u.ei.cpt_wd, u.ei.cpt_mask, + u.ei.cpt_cookie, name, NULL); + if (name) + kfree(name); +#endif + wprintk_ctx("inotify events dropped\n"); /* the replay code above is compiled out, so queued events are only reported, not restored */ + } else { + eprintk_ctx("bad object: %u\n", u.wi.cpt_object); + err = -EINVAL; + break; + } + pos += u.wi.cpt_next; /* advance to the next sub-object of this inotify image */ + } + return err; +} + +int rst_inotify(cpt_context_t *ctx) /* walks the CPT_SECT_INOTIFY section; returns 0 when the section is absent or fully processed, a negative errno otherwise */ +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_INOTIFY]; + loff_t endsec; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_INOTIFY || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + cpt_object_t *obj; + struct cpt_inotify_image ibuf; + + err = rst_get_object(CPT_OBJ_INOTIFY, sec, &ibuf, ctx); + if (err) + return err; + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ibuf.cpt_file, ctx); /* the inotify file itself must already be registered as a CPT_OBJ_FILE object */ + if (obj == NULL) { + eprintk_ctx("cannot find inotify file object\n"); + return -EINVAL; + } + err =
restore_one_inotify(obj, sec, &ibuf, ctx); + if (err) + return err; + sec += ibuf.cpt_next; + } + + return 0; +} diff -urNp linux-2.6.32.48/kernel/cpt/rst_mm.c linux-2.6.32.48-openvz/kernel/cpt/rst_mm.c --- linux-2.6.32.48/kernel/cpt/rst_mm.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_mm.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,1152 @@ +/* + * + * kernel/cpt/rst_mm.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#include +#endif +#include +#include +#include +#include + +#ifdef CONFIG_VE +#include +#include +#endif + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_ubc.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +#include "cpt_pagein.h" +#endif + +#include "cpt_syscalls.h" + +#define __PAGE_NX (1ULL<<63) + +static unsigned long make_prot(struct cpt_vma_image *vmai) +{ + unsigned long prot = 0; + + if (vmai->cpt_flags&VM_READ) + prot |= PROT_READ; + if (vmai->cpt_flags&VM_WRITE) + prot |= PROT_WRITE; + if (vmai->cpt_flags&VM_EXEC) + prot |= PROT_EXEC; + if (vmai->cpt_flags&VM_GROWSDOWN) + prot |= PROT_GROWSDOWN; + if (vmai->cpt_flags&VM_GROWSUP) + prot |= PROT_GROWSUP; + return prot; +} + +static unsigned long make_flags(struct cpt_vma_image *vmai) +{ + unsigned long flags = MAP_FIXED; + + if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE)) + flags |= MAP_SHARED; + else + flags |= MAP_PRIVATE; + + if (vmai->cpt_file == CPT_NULL) + flags |= MAP_ANONYMOUS; + if (vmai->cpt_flags&VM_GROWSDOWN) + flags |= MAP_GROWSDOWN; +#ifdef MAP_GROWSUP + if (vmai->cpt_flags&VM_GROWSUP) + flags |= MAP_GROWSUP; +#endif + if 
(vmai->cpt_flags&VM_DENYWRITE) + flags |= MAP_DENYWRITE; + if (vmai->cpt_flags&VM_EXECUTABLE) + flags |= MAP_EXECUTABLE; + if (!(vmai->cpt_flags&VM_ACCOUNT)) + flags |= MAP_NORESERVE; + return flags; +} + +#ifdef CONFIG_X86 +#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) \ + && !defined(CONFIG_XEN) +static int __alloc_ldt(mm_context_t *pc, int mincount) +{ + int oldsize, newsize, nr; + + if (mincount <= pc->size) + return 0; + /* + * LDT got larger - reallocate if necessary. + */ + oldsize = pc->size; + mincount = (mincount+511)&(~511); + newsize = mincount*LDT_ENTRY_SIZE; + for (nr = 0; nr * PAGE_SIZE < newsize; nr++) { + BUG_ON(nr * PAGE_SIZE >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC); + if (!pc->ldt_pages[nr]) + goto nomem; + clear_highpage(pc->ldt_pages[nr]); + } + } + pc->size = mincount; + return 0; + +nomem: + while (--nr >= 0) + __free_page(pc->ldt_pages[nr]); + pc->size = 0; + return -ENOMEM; +} + +static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) +{ + struct mm_struct *mm = current->mm; + int i; + int err; + int size; + + err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE); + if (err) + return err; + + size = mm->context.size*LDT_ENTRY_SIZE; + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i); + kunmap(mm->context.ldt_pages[nr]); + if (err) + return err; + } + + load_LDT(&mm->context); + return 0; +} + +#else + +static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) +{ + struct mm_struct *mm = current->mm; + int oldsize = mm->context.size; + void *oldldt; + void *newldt; + int err; + + if (li->cpt_size > PAGE_SIZE) + newldt = vmalloc(li->cpt_size); + else + newldt = kmalloc(li->cpt_size, GFP_KERNEL); 
+ + if (!newldt) + return -ENOMEM; + + err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen); + if (err) { + /* The new table was never installed: free it with the allocator that produced it. */ + if (li->cpt_size > PAGE_SIZE) + vfree(newldt); + else + kfree(newldt); + return err; + } + + oldldt = mm->context.ldt; + mm->context.ldt = newldt; + mm->context.size = li->cpt_size/LDT_ENTRY_SIZE; + + load_LDT(&mm->context); + + /* Release the previous table; its allocator is derived from its old size. */ + if (oldsize) { + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} +#endif +#endif + +static int +restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg) +{ + struct aio_ring_info *info = &aio_ctx->ring_info; + unsigned nr_events = aio_ctx->max_reqs; + unsigned long size; + int nr_pages; + + /* We recalculate parameters of the ring exactly like + * fs/aio.c does and then compare calculated values + * with ones, stored in dump. They must be the same. */ + + nr_events += 2; + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_events; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages != aimg->cpt_ring_pages) + return -EINVAL; + + info->nr_pages = nr_pages; + + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + if (nr_events != aimg->cpt_nr) + return -EINVAL; + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + } + + info->mmap_size = nr_pages * PAGE_SIZE; + + /* This piece of shit is not entirely my fault. Kernel aio.c makes + * something odd mmap()ping some pages and then pinning them. + * I guess it is just some mud remained of failed attempt to show ring + * to user space. The result is odd. :-) Immediately after + * creation of AIO context, kernel shares those pages with user + * and user can read and even write there. But after the first + * fork, pages are marked COW with evident consequences.
+ * I remember, I did the same mistake in the first version + * of mmapped packet socket, luckily that crap never reached + * mainstream. + * + * So, what are we going to do? I can simulate this odd behaviour + * exactly, but I am not insane yet. For now just take the pages + * from user space. Alternatively, we could keep kernel copy + * in AIO context image, which would be more correct. + * + * What is wrong now? If the pages are COWed, ring is transferred + * incorrectly. + */ + down_read(&current->mm->mmap_sem); + info->mmap_base = aimg->cpt_mmap_base; + info->nr_pages = get_user_pages(current, current->mm, + info->mmap_base, nr_pages, + 1, 0, info->ring_pages, NULL); + up_read(&current->mm->mmap_sem); + + if (unlikely(info->nr_pages != nr_pages)) { + int i; + + /* Not every ring page could be pinned: unpin the ones we got and fail. */ + for (i=0; i < info->nr_pages; i++) + put_page(info->ring_pages[i]); + if (info->ring_pages && info->ring_pages != info->internal_pages) + kfree(info->ring_pages); + return -EFAULT; + } + + aio_ctx->user_id = info->mmap_base; + + info->nr = nr_events; + info->tail = aimg->cpt_tail; + + return 0; +} + +/* Allocate a kioctx for current->mm, rebuild its ring from the image and initialise its locks, lists and work item. */ +static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx) +{ + int err; + struct kioctx *aio_ctx; + extern spinlock_t aio_nr_lock; + + aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (!aio_ctx) + return -ENOMEM; + + memset(aio_ctx, 0, sizeof(*aio_ctx)); + aio_ctx->max_reqs = aimg->cpt_max_reqs; + + if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) { + kmem_cache_free(kioctx_cachep, aio_ctx); + eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err); + return err; + } + + aio_ctx->mm = current->mm; + atomic_inc(&aio_ctx->mm->mm_count); + atomic_set(&aio_ctx->users, 1); + spin_lock_init(&aio_ctx->ctx_lock); + spin_lock_init(&aio_ctx->ring_info.ring_lock); + init_waitqueue_head(&aio_ctx->wait); + INIT_LIST_HEAD(&aio_ctx->active_reqs); + INIT_LIST_HEAD(&aio_ctx->run_list); + INIT_WORK(&aio_ctx->wq.work, aio_kick_handler); + + spin_lock(&aio_nr_lock); + aio_nr += aio_ctx->max_reqs;
spin_unlock(&aio_nr_lock); + + spin_lock(&aio_ctx->mm->ioctx_lock); + hlist_add_head(&aio_ctx->list, &aio_ctx->mm->ioctx_list); + spin_unlock(&aio_ctx->mm->ioctx_lock); + + return 0; +} + +struct anonvma_map +{ + struct hlist_node list; + struct anon_vma *avma; + __u64 id; +}; + +static int verify_create_anonvma(struct mm_struct *mm, + struct cpt_vma_image *vmai, + cpt_context_t *ctx) +{ + struct anon_vma *avma = NULL; + struct anon_vma *new_avma; + struct vm_area_struct *vma; + int h; + + if (!ctx->anonvmas) { + if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE) + return -EINVAL; + if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL) + return -ENOMEM; + for (h = 0; h < CPT_ANONVMA_HSIZE; h++) + INIT_HLIST_HEAD(&ctx->anonvmas[h]); + } else { + struct anonvma_map *map; + struct hlist_node *elem; + + h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); + hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) { + if (map->id == vmai->cpt_anonvmaid) { + avma = map->avma; + break; + } + } + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + return -ESRCH; + } + if (vma->vm_start != vmai->cpt_start) { + up_read(&mm->mmap_sem); + eprintk_ctx("vma start mismatch\n"); + return -EINVAL; + } + if (vma->vm_pgoff != vmai->cpt_pgoff) { + dprintk_ctx("vma pgoff mismatch, fixing\n"); + if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) { + eprintk_ctx("cannot fixup vma pgoff\n"); + up_read(&mm->mmap_sem); + return -EINVAL; + } + vma->vm_pgoff = vmai->cpt_pgoff; + } + + if (!vma->anon_vma) { + if (avma) { + vma->anon_vma = avma; + anon_vma_link(vma); + } else { + int err; + + err = anon_vma_prepare(vma); + + if (err) { + up_read(&mm->mmap_sem); + return err; + } + } + } else { + /* Note, we _can_ arrive to the situation, when two + * different anonvmaid's point to one anon_vma, this happens + * f.e. 
when mmap() merged new area to previous one and + * they will share one anon_vma even if they did not on + * original host. + * + * IT IS OK. To all that I understand, we may merge all + * the anon_vma's and rmap can scan all the huge list of vmas + * searching for page. It is just "suboptimal". + * + * Real disaster would happen, if vma already got an anon_vma + * with different id. It is very rare case, kernel does the + * best efforts to merge anon_vmas when some attributes are + * different. In this case we will fall to copying memory. + */ + if (avma && vma->anon_vma != avma) { + up_read(&mm->mmap_sem); + wprintk_ctx("anon_vma mismatch\n"); + return 0; + } + } + + new_avma = vma->anon_vma; + up_read(&mm->mmap_sem); + + if (!avma) { + struct anonvma_map *map; + + if (!new_avma) + return -EINVAL; + + if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL) + return -ENOMEM; + + map->id = vmai->cpt_anonvmaid; + map->avma = new_avma; + h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); + hlist_add_head(&map->list, &ctx->anonvmas[h]); + } + return 0; +} + +static int copy_mm_pages(struct mm_struct *src, unsigned long start, + unsigned long end) +{ + int err; + + for (; start < end; start += PAGE_SIZE) { + struct page *page; + struct page *spage; + void *maddr, *srcaddr; + + err = get_user_pages(current, current->mm, + start, 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) + return err; + + err = get_user_pages(current, src, + start, 1, 0, 1, &spage, NULL); + + if (err == 0) + err = -EFAULT; + if (err < 0) { + page_cache_release(page); + return err; + } + + srcaddr = kmap(spage); + maddr = kmap(page); + memcpy(maddr, srcaddr, PAGE_SIZE); + set_page_dirty_lock(page); + kunmap(page); + kunmap(spage); + page_cache_release(page); + page_cache_release(spage); + } + return 0; +} + +#include + +static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx) +{ + int err = 0; + unsigned long 
addr; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct file *file = NULL; + unsigned long prot; + int checked = 0; + + if (vmai->cpt_type == CPT_VMA_VDSO) { + if (ctx->vdso == NULL) { +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES + err = arch_setup_additional_pages(NULL, 0, + vmai->cpt_start); +#endif + goto out; + } + } + + prot = make_prot(vmai); + + if (vmai->cpt_file != CPT_NULL) { + if (vmai->cpt_type == CPT_VMA_TYPE_0) { + file = rst_file(vmai->cpt_file, -1, ctx); + if (IS_ERR(file)) { + eprintk_ctx("do_rst_vma: rst_file: %Ld\n", (unsigned long long)vmai->cpt_file); + return PTR_ERR(file); + } + } else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) { + file = rst_sysv_shm_vma(vmai, ctx); + if (IS_ERR(file)) + return PTR_ERR(file); + } + } + + down_write(&mm->mmap_sem); + + if ((make_flags(vmai) & VM_EXECUTABLE) && mm->exe_file != file) + set_mm_exe_file(mm, file); + + addr = do_mmap_pgoff(file, vmai->cpt_start, + vmai->cpt_end-vmai->cpt_start, + prot, make_flags(vmai), + vmai->cpt_pgoff); + + if (addr != vmai->cpt_start) { + up_write(&mm->mmap_sem); + + err = -EINVAL; + if (IS_ERR((void*)addr)) + err = addr; + goto out; + } + + vma = find_vma(mm, vmai->cpt_start); + if (vma == NULL) { + up_write(&mm->mmap_sem); + eprintk_ctx("cannot find mmapped vma\n"); + err = -ESRCH; + goto out; + } + + /* do_mmap_pgoff() can merge new area to previous one (not to the next, + * we mmap in order, the rest of mm is still unmapped). This can happen + * f.e. if flags are to be adjusted later, or if we had different + * anon_vma on two adjacent regions. Split it by brute force. 
*/ + if (vma->vm_start != vmai->cpt_start) { + dprintk_ctx("vma %Ld merged, split\n", vmapos); + err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0); + if (err) { + up_write(&mm->mmap_sem); + eprintk_ctx("cannot split vma\n"); + goto out; + } + } + up_write(&mm->mmap_sem); + + if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) { + err = verify_create_anonvma(mm, vmai, ctx); + if (err) { + eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos); + goto out; + } + } + + if (vmai->cpt_type == CPT_VMA_VDSO) { + struct page *page; + void *maddr; + + err = get_user_pages(current, current->mm, + (unsigned long)vmai->cpt_start, + 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) { + eprintk_ctx("can't get vdso: get_user_pages: %d\n", err); + goto out; + } + err = 0; + maddr = kmap(page); + memcpy(maddr, ctx->vdso, PAGE_SIZE); + set_page_dirty_lock(page); + kunmap(page); + page_cache_release(page); + goto out; + } + + if (vmai->cpt_next > vmai->cpt_hdrlen) { + loff_t offset = vmapos + vmai->cpt_hdrlen; + + do { + union { + struct cpt_page_block pb; + struct cpt_remappage_block rpb; + struct cpt_copypage_block cpb; + struct cpt_lazypage_block lpb; + struct cpt_iterpage_block ipb; + } u; + loff_t pos; + + err = rst_get_object(-1, offset, &u, ctx); + if (err) { + eprintk_ctx("vma fix object: %d\n", err); + goto out; + } + if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) { + err = sc_remap_file_pages(u.rpb.cpt_start, + u.rpb.cpt_end-u.rpb.cpt_start, + 0, u.rpb.cpt_pgoff, 0); + if (err < 0) { + eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err, + (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start), + (__u32)u.rpb.cpt_pgoff); + goto out; + } + offset += u.rpb.cpt_next; + continue; + } else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) { +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + unsigned long ptr = u.lpb.cpt_start; + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost 
vm_area_struct\n"); + err = -ESRCH; + goto out; + } + err = anon_vma_prepare(vma); + if (err) { + up_read(&mm->mmap_sem); + goto out; + } + while (ptr < u.lpb.cpt_end) { + err = rst_pagein(vma, u.lpb.cpt_index + (ptr-u.lpb.cpt_start)/PAGE_SIZE, + ptr, ctx); + if (err) + break; + ptr += PAGE_SIZE; + } + up_read(&mm->mmap_sem); +#else + err = -EINVAL; +#endif + if (err) + goto out; + offset += u.cpb.cpt_next; + continue; + } else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) { + struct vm_area_struct *vma, *vma1; + struct mm_struct *src; + struct anon_vma *src_anon; + cpt_object_t *mobj; + + if (!vmai->cpt_anonvmaid) { + err = -EINVAL; + eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n"); + goto out; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx); + if (!mobj) { + eprintk_ctx("lost mm_struct to clone pages from\n"); + err = -ESRCH; + goto out; + } + src = mobj->o_obj; + + down_read(&src->mmap_sem); + src_anon = NULL; + vma1 = find_vma(src, u.cpb.cpt_start); + if (vma1) + src_anon = vma1->anon_vma; + up_read(&src->mmap_sem); + + if (!vma1) { + eprintk_ctx("lost src vm_area_struct\n"); + err = -ESRCH; + goto out; + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + + if (!src_anon || + !vma->anon_vma || + vma->anon_vma != src_anon || + vma->vm_start - vma1->vm_start != + (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) { + up_read(&mm->mmap_sem); + wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos); + err = copy_mm_pages(mobj->o_obj, + u.cpb.cpt_start, + u.cpb.cpt_end); + } else { + err = __copy_page_range(vma, vma1, + u.cpb.cpt_start, + u.cpb.cpt_end-u.cpb.cpt_start); + up_read(&mm->mmap_sem); + } + if (err) { + eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err, + (__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start), + (long)u.cpb.cpt_source); + goto out; + } + + offset += u.cpb.cpt_next; 
+ continue; + } else if (u.pb.cpt_object == CPT_OBJ_ITERPAGES || + u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES + ) { +#ifdef CONFIG_VZ_CHECKPOINT_ITER + unsigned long ptr = u.lpb.cpt_start; + u64 page_pos[16]; + pos = offset + sizeof(u.pb); + + err = ctx->pread(&page_pos, + 8*(u.lpb.cpt_end-ptr)/PAGE_SIZE, + ctx, + pos); + if (err) { + eprintk_ctx("Oops\n"); + goto out; + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + err = anon_vma_prepare(vma); + if (err) { + up_read(&mm->mmap_sem); + goto out; + } + while (ptr < u.lpb.cpt_end) { + err = rst_iter(vma, + page_pos[(ptr-u.lpb.cpt_start)/PAGE_SIZE], + ptr, + ctx); + if (err) + break; + ptr += PAGE_SIZE; + } + if (u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES) { + make_pages_present((unsigned long)u.lpb.cpt_start, + (unsigned long)u.lpb.cpt_end); + } + up_read(&mm->mmap_sem); +#else + err = -EINVAL; +#endif + if (err) + goto out; + offset += u.cpb.cpt_next; + continue; + } + if (u.pb.cpt_object != CPT_OBJ_PAGES) { + eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object); + err = -EINVAL; + goto out; + } + pos = offset + sizeof(u.pb); + if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) { + /* I guess this is get_user_pages() messed things, + * this happens f.e. when gdb inserts breakpoints. 
+ */ + int i; + for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) { + struct page *page; + void *maddr; + err = get_user_pages(current, current->mm, + (unsigned long)u.pb.cpt_start + i*PAGE_SIZE, + 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) { + eprintk_ctx("get_user_pages: %d\n", err); + goto out; + } + err = 0; + maddr = kmap(page); + if (u.pb.cpt_content == CPT_CONTENT_VOID) { + memset(maddr, 0, PAGE_SIZE); + } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { + err = ctx->pread(maddr, PAGE_SIZE, + ctx, pos + i*PAGE_SIZE); + if (err) { + kunmap(page); + /* drop the reference taken by get_user_pages() before bailing out */ + page_cache_release(page); + goto out; + } + } else { + err = -EINVAL; + kunmap(page); + /* unknown content type: the pinned page must still be released */ + page_cache_release(page); + goto out; + } + set_page_dirty_lock(page); + kunmap(page); + page_cache_release(page); + } + } else { + /* Writable (or accounted) mapping: fill it through the user address, temporarily granting write access if needed. */ + if (!(prot&PROT_WRITE)) + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); + if (u.pb.cpt_content == CPT_CONTENT_VOID) { + int i; + for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) { + err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i); + if (err) { + eprintk_ctx("__put_user 2 %d\n", err); + goto out; + } + } + } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { + loff_t tpos = pos; + err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start), + u.pb.cpt_end-u.pb.cpt_start, + &tpos); + if (err != u.pb.cpt_end-u.pb.cpt_start) { + if (err >= 0) + err = -EIO; + goto out; + } + } else { + err = -EINVAL; + goto out; + } + if (!(prot&PROT_WRITE)) + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); + } + err = 0; + offset += u.pb.cpt_next; + } while (offset < vmapos + vmai->cpt_next); + } + +check: + do { + struct vm_area_struct *vma; + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + if (vma) { + if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) { + VM_ClearReadHint(vma); + vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK; + } + if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) { + dprintk_ctx("fixing up
VM_LOCKED %Ld\n", vmapos); + up_read(&mm->mmap_sem); + if (vma->vm_flags&VM_LOCKED) + err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); + else + err = sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); + /* When mlock fails with EFAULT, it means + * that it could not bring in pages. + * It can happen after mlock() on unreadable + * VMAs. But VMA is correctly locked, + * so that this error can be ignored. */ + if (err == -EFAULT) + err = 0; + if (err) + goto out; + goto check; + } + if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX) + wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, + (unsigned long long)vma->vm_page_prot.pgprot, + (unsigned long long)vmai->cpt_pgprot); +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) + if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) && + (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE)) + wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, + (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot); +#endif + if (vma->vm_flags != vmai->cpt_flags) { + unsigned long x = vma->vm_flags ^ vmai->cpt_flags; + if (x & VM_EXEC) { + /* Crap. On i386 this is OK. + * It is impossible to make via mmap/mprotect + * exec.c clears VM_EXEC on stack. 
*/ + vma->vm_flags &= ~VM_EXEC; + } else if ((x & VM_ACCOUNT) && !checked) { + checked = 1; + if (!(prot&PROT_WRITE)) { + up_read(&mm->mmap_sem); + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); + goto check; + } + wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, + (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); + } else { + wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, + (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); + } + } + } else { + wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos); + } + up_read(&mm->mmap_sem); + } while (0); + +out: + if (file) + fput(file); + return err; +} + +#ifndef CONFIG_IA64 +#define TASK_UNMAP_START 0 +#else +/* On IA64 the first page is a special VM_IO|VM_RESERVED mapping + * used to accelerate speculative dereferences of NULL pointer. */ +#define TASK_UNMAP_START PAGE_SIZE +#endif + +static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx) +{ + int err = 0; + unsigned int def_flags; + struct mm_struct *mm = current->mm; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *bc; +#endif + + down_write(&mm->mmap_sem); + do_munmap(mm, TASK_UNMAP_START, TASK_SIZE-TASK_UNMAP_START); + +#ifdef CONFIG_BEANCOUNTERS + /* + * MM beancounter is usually correct from the fork time, + * but not for init, for example. + * Luckily, mm_ub can be changed for a completely empty MM. 
+ */ + bc = rst_lookup_ubc(vmi->cpt_mmub, ctx); + err = virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_RSTMM, bc); + if (err & NOTIFY_FAIL) { + up_write(&mm->mmap_sem); + return -ECHRNG; + } + if ((err & VIRTNOTIFY_CHANGE) && bc != mm->mm_ub) { + struct user_beancounter *old_bc; + + old_bc = mm->mm_ub; + mm->mm_ub = bc; + bc = old_bc; + } + err = 0; + put_beancounter(bc); +#endif + + mm->start_code = vmi->cpt_start_code; + mm->end_code = vmi->cpt_end_code; + mm->start_data = vmi->cpt_start_data; + mm->end_data = vmi->cpt_end_data; + mm->start_brk = vmi->cpt_start_brk; + mm->brk = vmi->cpt_brk; + mm->start_stack = vmi->cpt_start_stack; + mm->arg_start = vmi->cpt_start_arg; + mm->arg_end = vmi->cpt_end_arg; + mm->env_start = vmi->cpt_start_env; + mm->env_end = vmi->cpt_end_env; + mm->def_flags = 0; + def_flags = vmi->cpt_def_flags; + + mm->flags = vmi->cpt_dumpable; + if (ctx->image_version < CPT_VERSION_24) + mm->flags |= MMF_DUMP_FILTER_DEFAULT << MMF_DUMPABLE_BITS; + + mm->vps_dumpable = vmi->cpt_vps_dumpable; +#ifndef CONFIG_IA64 + if (ctx->image_version >= CPT_VERSION_9) { + mm->context.vdso = cpt_ptr_import(vmi->cpt_vdso); +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) + current_thread_info()->sysenter_return = + VDSO32_SYMBOL(mm->context.vdso, SYSENTER_RETURN); +#endif + } +#endif + +#if 0 /* def CONFIG_HUGETLB_PAGE*/ +/* NB: ? */ + int used_hugetlb; +#endif + up_write(&mm->mmap_sem); + + if (vmi->cpt_next > vmi->cpt_hdrlen) { + loff_t offset = pos + vmi->cpt_hdrlen; + do { + union { + struct cpt_vma_image vmai; + struct cpt_aio_ctx_image aioi; + struct cpt_obj_bits bits; + } u; + err = rst_get_object(-1, offset, &u, ctx); + if (err) + goto out; + if (u.vmai.cpt_object == CPT_OBJ_VMA) { +#ifdef CONFIG_IA64 + //// Later... 
+ if (u.vmai.cpt_start) +#endif + err = do_rst_vma(&u.vmai, offset, pos, ctx); + if (err) + goto out; +#ifdef CONFIG_X86 + } else if (u.bits.cpt_object == CPT_OBJ_BITS && + u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) { + err = do_rst_ldt(&u.bits, offset, ctx); + if (err) + goto out; +#endif + } else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) { + err = do_rst_aio(&u.aioi, offset, ctx); + if (err) + goto out; + } else { + eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object); + err = -EINVAL; + goto out; + } + offset += u.vmai.cpt_next; + } while (offset < pos + vmi->cpt_next); + } + + down_write(&mm->mmap_sem); + mm->def_flags = def_flags; + up_write(&mm->mmap_sem); + + +out: + return err; +} + +extern void exit_mm(struct task_struct * tsk); + +int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + int err = 0; + cpt_object_t *mobj; + void *tmp = (void*)__get_free_page(GFP_KERNEL); + struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp; + + if (!tmp) + return -ENOMEM; + + if (ti->cpt_mm == CPT_NULL) { + if (current->mm) { + virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, + current); + exit_mm(current); + } + goto out; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); + if (mobj) { + if (current->mm != mobj->o_obj) BUG(); + goto out; + } + + if (current->mm == NULL) { + struct mm_struct *mm = mm_alloc(); + if (mm == NULL) { + err = -ENOMEM; + goto out; + } + err = init_new_context(current, mm); + if (err) { + mmdrop(mm); + goto out; + } + current->mm = mm; + } + + if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0) + goto out; + if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) { + eprintk_ctx("do_rst_mm %Ld\n", (unsigned long long)ti->cpt_mm); + goto out; + } + err = -ENOMEM; + mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx); + if (mobj != NULL) { + err = 0; + cpt_obj_setpos(mobj, ti->cpt_mm, ctx); + } + +out: + if (tmp) + free_page((unsigned long)tmp); + return err; +} + +/* This is 
part of mm setup, made in parent context. Mostly, it is the place, + * where we graft mm of another process to child. + */ + +int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + cpt_object_t *mobj; + + /* Task without mm. Just get rid of this. */ + if (ti->cpt_mm == CPT_NULL) { + if (tsk->mm) { + virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, + tsk); + mmput(tsk->mm); + tsk->mm = NULL; + } + return 0; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); + if (mobj) { + struct mm_struct *newmm = mobj->o_obj; + /* Good, the MM is already created. */ + if (newmm == tsk->mm) { + /* Already done by clone(). */ + return 0; + } + mmput(tsk->mm); + atomic_inc(&newmm->mm_users); + tsk->mm = newmm; + tsk->active_mm = newmm; + } + return 0; +} + +/* We use CLONE_VM when mm of child is going to be shared with parent. + * Otherwise mm is copied. + */ + +__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + if (ti->cpt_mm == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx)) + return CLONE_VM; + return 0; +} diff -urNp linux-2.6.32.48/kernel/cpt/rst_net.c linux-2.6.32.48-openvz/kernel/cpt/rst_net.c --- linux-2.6.32.48/kernel/cpt/rst_net.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_net.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,745 @@ +/* + * + * kernel/cpt/rst_net.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" +#include "cpt_net.h" +#include "cpt_files.h" + +#include "cpt_syscalls.h" + +extern struct in_ifaddr *inet_alloc_ifa(void); +extern int inet_insert_ifa(struct in_ifaddr *ifa); +extern struct in_device *inetdev_init(struct net_device *dev); + +int rst_restore_ifaddr(struct cpt_context *ctx) +{ + struct net *net = get_exec_env()->ve_netns; + int err; + loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_ifaddr_image di; + struct net_device *dev; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int cindex = -1; + int err; + err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx); + if (err) + return err; + cindex = di.cpt_index; + rtnl_lock(); + dev = __dev_get_by_index(net, cindex); + if (dev && di.cpt_family == AF_INET) { + struct in_device *in_dev; + struct in_ifaddr *ifa; + if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) + in_dev = inetdev_init(dev); + ifa = inet_alloc_ifa(); + if (ifa) { + ifa->ifa_local = di.cpt_address[0]; + ifa->ifa_address = di.cpt_peer[0]; + ifa->ifa_broadcast = di.cpt_broadcast[0]; + ifa->ifa_prefixlen = di.cpt_masklen; + ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); + ifa->ifa_flags = di.cpt_flags; + ifa->ifa_scope = di.cpt_scope; + memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + err = inet_insert_ifa(ifa); + if (err && err != -EEXIST) { + rtnl_unlock(); 
+ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); + return err; + } + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + } else if (dev && di.cpt_family == AF_INET6) { + __u32 prefered_lft; + __u32 valid_lft; + struct net *net = get_exec_env()->ve_ns->net_ns; + prefered_lft = (di.cpt_flags & IFA_F_DEPRECATED) ? + 0 : di.cpt_prefered_lft; + valid_lft = (di.cpt_flags & IFA_F_PERMANENT) ? + 0xFFFFFFFF : di.cpt_valid_lft; + err = inet6_addr_add(net, dev->ifindex, + (struct in6_addr *)di.cpt_address, + di.cpt_masklen, 0, + prefered_lft, + valid_lft); + if (err && err != -EEXIST) { + rtnl_unlock(); + eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); + return err; + } +#endif + } else { + rtnl_unlock(); + eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index); + return -EINVAL; + } + rtnl_unlock(); + sec += di.cpt_next; + } + return 0; +} + +static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx) +{ + int min_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + struct rtmsg *rtm = NLMSG_DATA(nlh); + __u32 prefix0 = 0; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(rta, attrlen)) { + if (rta->rta_type == RTA_DST) { + prefix0 = *(__u32*)RTA_DATA(rta); + } + rta = RTA_NEXT(rta, attrlen); + } + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (rtm->rtm_family == AF_INET6) { + if (rtm->rtm_type == RTN_LOCAL) + return 2; + if (rtm->rtm_flags & RTM_F_CLONED) + return 2; + if (rtm->rtm_protocol == RTPROT_UNSPEC || + rtm->rtm_protocol == RTPROT_RA || + rtm->rtm_protocol == RTPROT_REDIRECT || + rtm->rtm_protocol == RTPROT_KERNEL) + return 2; + if (rtm->rtm_protocol == RTPROT_BOOT && + ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) || + (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000)))) + return 2; + } +#endif + return rtm->rtm_protocol == RTPROT_KERNEL; 
+} + +int rst_restore_route(struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct msghdr msg; + struct iovec iov; + struct sockaddr_nl nladdr; + mm_segment_t oldfs; + loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr v; + char *pg; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + if (h.cpt_hdrlen >= h.cpt_next) + return 0; + + sec += h.cpt_hdrlen; + err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx); + if (err < 0) + return err; + + err = sock_create(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); + if (err) + return err; + + pg = (char*)__get_free_page(GFP_KERNEL); + if (pg == NULL) { + err = -ENOMEM; + goto out_sock; + } + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + endsec = sec + v.cpt_next; + sec += v.cpt_hdrlen; + + while (sec < endsec) { + struct nlmsghdr *n; + struct nlmsghdr nh; + int kernel_flag; + + if (endsec - sec < sizeof(nh)) + break; + + err = ctx->pread(&nh, sizeof(nh), ctx, sec); + if (err) + goto out_sock_pg; + if (nh.nlmsg_len < sizeof(nh) || nh.nlmsg_len > PAGE_SIZE || + endsec - sec < nh.nlmsg_len) { + err = -EINVAL; + goto out_sock_pg; + } + err = ctx->pread(pg, nh.nlmsg_len, ctx, sec); + if (err) + goto out_sock_pg; + + n = (struct nlmsghdr*)pg; + n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE; + + err = rewrite_rtmsg(n, ctx); + if (err < 0) + goto out_sock_pg; + kernel_flag = err; + + if (kernel_flag == 2) + goto do_next; + + iov.iov_base=n; + iov.iov_len=nh.nlmsg_len; + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(sock, &msg, nh.nlmsg_len); + set_fs(oldfs); + + if (err < 0) + 
goto out_sock_pg; + err = 0; + + iov.iov_base=pg; + iov.iov_len=PAGE_SIZE; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); + set_fs(oldfs); + if (err != -EAGAIN) { + if (n->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *e = NLMSG_DATA(n); + if (e->error != -EEXIST || !kernel_flag) + eprintk_ctx("NLMERR: %d\n", e->error); + } else { + eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type); + } + } +do_next: + err = 0; + sec += NLMSG_ALIGN(nh.nlmsg_len); + } + +out_sock_pg: + free_page((unsigned long)pg); +out_sock: + sock_release(sock); + return err; +} + +int rst_resume_network(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + env->disable_net = 0; + put_ve(env); + return 0; +} + +static int rst_restore_netstats(loff_t pos, struct net_device *dev, + struct cpt_context * ctx) +{ + struct cpt_netstats_image *n; + struct net_device_stats *stats = NULL; + int err; + + if (!dev->netdev_ops->ndo_get_stats) + return 0; + + n = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_NET_STATS, pos, n, ctx); + if (err) + goto out; + BUG_ON(sizeof(struct cpt_netstats_image) != n->cpt_hdrlen); + preempt_disable(); + + if (dev->netdev_ops->ndo_cpt == NULL) { + err = -ENODEV; + eprintk_ctx("Network device %s is not supported\n", dev->name); + goto out; + } + + stats = dev->netdev_ops->ndo_get_stats(dev); + + stats->rx_packets = n->cpt_rx_packets; + stats->tx_packets = n->cpt_tx_packets; + stats->rx_bytes = n->cpt_rx_bytes; + stats->tx_bytes = n->cpt_tx_bytes; + stats->rx_errors = n->cpt_rx_errors; + stats->tx_errors = n->cpt_tx_errors; + stats->rx_dropped = n->cpt_rx_dropped; + stats->tx_dropped = n->cpt_tx_dropped; + stats->multicast = n->cpt_multicast; + stats->collisions = n->cpt_collisions; + stats->rx_length_errors = n->cpt_rx_length_errors; + stats->rx_over_errors = n->cpt_rx_over_errors; + stats->rx_crc_errors = n->cpt_rx_crc_errors; + stats->rx_frame_errors = 
n->cpt_rx_frame_errors; + stats->rx_fifo_errors = n->cpt_rx_fifo_errors; + stats->rx_missed_errors = n->cpt_rx_missed_errors; + stats->tx_aborted_errors = n->cpt_tx_aborted_errors; + stats->tx_carrier_errors = n->cpt_tx_carrier_errors; + stats->tx_fifo_errors = n->cpt_tx_fifo_errors; + stats->tx_heartbeat_errors = n->cpt_tx_heartbeat_errors; + stats->tx_window_errors = n->cpt_tx_window_errors; + stats->rx_compressed = n->cpt_rx_compressed; + stats->tx_compressed = n->cpt_tx_compressed; + +out: + preempt_enable(); + cpt_release_buf(ctx); + return err; +} + +int rst_restore_netdev(struct cpt_context *ctx) +{ + struct net *net = get_exec_env()->ve_netns; + int err; + loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_netdev_image di; + struct net_device *dev; + + get_exec_env()->disable_net = 1; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + loff_t pos; + struct net_device *dev_new; + struct netdev_rst *ops; + + err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx); + if (err) + return err; + + rtnl_lock(); + pos = sec + di.cpt_hdrlen; + if (di.cpt_next > sizeof(di)) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), + ctx, sec + di.cpt_hdrlen); + if (err) + goto out; + + ops = NULL; + while (1) { + ops = netdev_find_rst(hdr.cpt_object, ops); + if (ops == NULL) + break; + + err = ops->ndo_rst(sec, &di, &rst_ops, ctx); + if (!err) { + pos += hdr.cpt_next; + break; + } else if (err < 0) { + eprintk_ctx("netdev %d rst failed %d\n", + hdr.cpt_object, err); + goto out; + } + } + } + + dev = __dev_get_by_name(net, di.cpt_name); + if (dev) { + if (dev->ifindex != di.cpt_index) { + dev_new = __dev_get_by_index(net, di.cpt_index); + if (!dev_new) { + 
write_lock_bh(&dev_base_lock); + hlist_del(&dev->index_hlist); + if (dev->iflink == dev->ifindex) + dev->iflink = di.cpt_index; + dev->ifindex = di.cpt_index; + hlist_add_head(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); + write_unlock_bh(&dev_base_lock); + } else { + write_lock_bh(&dev_base_lock); + hlist_del(&dev->index_hlist); + hlist_del(&dev_new->index_hlist); + if (dev_new->iflink == dev_new->ifindex) + dev_new->iflink = dev->ifindex; + dev_new->ifindex = dev->ifindex; + if (dev->iflink == dev->ifindex) + dev->iflink = di.cpt_index; + dev->ifindex = di.cpt_index; + hlist_add_head(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); + hlist_add_head(&dev_new->index_hlist, + dev_index_hash(net, dev_new->ifindex)); + write_unlock_bh(&dev_base_lock); + } + } + if (di.cpt_flags^dev->flags) { + err = dev_change_flags(dev, di.cpt_flags); + if (err) + eprintk_ctx("dev_change_flags err: %d\n", err); + } + while (pos < sec + di.cpt_next) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), + ctx, pos); + if (err) + goto out; + if (hdr.cpt_object == CPT_OBJ_NET_HWADDR) { + /* Restore hardware address */ + struct cpt_hwaddr_image hw; + err = rst_get_object(CPT_OBJ_NET_HWADDR, + pos, &hw, ctx); + if (err) + goto out; + BUILD_BUG_ON(sizeof(hw.cpt_dev_addr) != + MAX_ADDR_LEN); + memcpy(dev->dev_addr, hw.cpt_dev_addr, + sizeof(hw.cpt_dev_addr)); + } else if (hdr.cpt_object == CPT_OBJ_NET_STATS) { + err = rst_restore_netstats(pos, dev, ctx); + if (err) { + eprintk_ctx("rst stats %s: %d\n", + di.cpt_name, err); + goto out; + } + } + pos += hdr.cpt_next; + } + } else { + eprintk_ctx("unknown interface 2 %s\n", di.cpt_name); + } + rtnl_unlock(); + sec += di.cpt_next; + } + return 0; +out: + rtnl_unlock(); + return err; +} + +static int dumpfn(void *arg) +{ + int i; + int *pfd = arg; + char *argv[] = { "iptables-restore", "-c", NULL }; + + if (pfd[0] != 0) + sc_dup2(pfd[0], 0); + + for (i=1; ifiles->fdt->max_fds; i++) + 
sc_close(i); + + module_put(THIS_MODULE); + + set_fs(KERNEL_DS); + i = sc_execve("/sbin/iptables-restore", argv, NULL); + if (i == -ENOENT) + i = sc_execve("/usr/sbin/iptables-restore", argv, NULL); + eprintk("failed to exec iptables-restore: %d\n", i); + return 255 << 8; +} + +static int rst_restore_iptables(struct cpt_context * ctx) +{ + int err; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + int n; + struct cpt_section_hdr h; + loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES]; + loff_t end; + int pid; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + if (h.cpt_hdrlen == h.cpt_next) + return 0; + if (h.cpt_hdrlen > h.cpt_next) + return -EINVAL; + sec += h.cpt_hdrlen; + err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx); + if (err < 0) + return err; + + err = sc_pipe(pfd); + if (err < 0) + return err; + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); + if (err < 0) { + eprintk_ctx("iptables local_kernel_thread: %d\n", err); + goto out; + } + f = fget(pfd[1]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + ctx->file->f_pos = sec + v.cpt_hdrlen; + end = sec + v.cpt_next; + do { + char *p; + char buf[16]; + + n = end - ctx->file->f_pos; + if (n > sizeof(buf)) + n = sizeof(buf); + + if (ctx->read(buf, n, ctx)) + break; + if ((p = memchr(buf, 0, n)) != NULL) + n = p - buf; + oldfs = get_fs(); set_fs(KERNEL_DS); + f->f_op->write(f, buf, n, &f->f_pos); + set_fs(oldfs); + } while (ctx->file->f_pos < end); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + 
eprintk_ctx("iptables-restore exited with %d\n", err); + eprintk_ctx("Most probably some iptables modules are not loaded\n"); + err = -EINVAL; + } + } else { + eprintk_ctx("iptables-restore terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + return err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); + return err; +} + +static int rst_restore_snmp_stat(struct cpt_context *ctx, void *mib[], int n, + loff_t *ppos, loff_t endpos) +{ + int err, in, i; + struct cpt_object_hdr o; + __u32 *stats; + + err = rst_get_object(CPT_OBJ_BITS, *ppos, &o, ctx); + if (err) + return err; + + in = o.cpt_next - o.cpt_hdrlen; + if (in >= PAGE_SIZE - 4) { + eprintk_ctx("Too long SNMP buf (%d)\n", in); + return -EINVAL; + } + + if (o.cpt_content != CPT_CONTENT_DATA) { + if (o.cpt_content == CPT_CONTENT_VOID) + return 1; + + eprintk_ctx("Corrupted SNMP stats\n"); + return -EINVAL; + } + + stats = cpt_get_buf(ctx); + err = ctx->pread(stats, in, ctx, (*ppos) + o.cpt_hdrlen); + if (err) + goto out; + + in /= sizeof(*stats); + if (in > n) + wprintk_ctx("SNMP stats trimmed\n"); + else + n = in; + + for (i = 0; i < n; i++) + *((unsigned long *)(per_cpu_ptr(mib[0], 0)) + i) = stats[i]; + + *ppos += o.cpt_next; + if (*ppos < endpos) + err = 1; /* go on restoring */ +out: + cpt_release_buf(ctx); + return err; +} + +static int rst_restore_snmp(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SNMP_STATS]; + loff_t endsec; + struct cpt_section_hdr h; + struct ve_struct *ve; + struct net *net; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_SNMP_STATS || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + ve = get_exec_env(); + net = ve->ve_netns; + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + if (sec >= endsec) + goto out; + + err = 
/*
 * Restore the whole network state of the container: devices, addresses,
 * routes, iptables rules, conntrack entries and SNMP counters, in that
 * order.  The first failing stage stops the sequence and its error code is
 * returned; 0 means every stage succeeded.
 */
int rst_restore_net(struct cpt_context *ctx)
{
	int err;

	err = rst_restore_netdev(ctx);
	if (err)
		return err;

	err = rst_restore_ifaddr(ctx);
	if (err)
		return err;

	err = rst_restore_route(ctx);
	if (err)
		return err;

	err = rst_restore_iptables(ctx);
	if (err)
		return err;

	err = rst_restore_ip_conntrack(ctx);
	if (err)
		return err;

	return rst_restore_snmp(ctx);
}
+ * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" + +MODULE_AUTHOR("Alexey Kuznetsov "); +MODULE_LICENSE("GPL"); + +/* List of contexts and lock protecting the list */ +static struct list_head cpt_context_list; +static spinlock_t cpt_context_lock; + +static int proc_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + cpt_context_t *ctx; + + len += sprintf(buffer, "Ctx Id VE State\n"); + + spin_lock(&cpt_context_lock); + + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + len += sprintf(buffer+len,"%p %08x %-8u %d", + ctx, + ctx->contextid, + ctx->ve_id, + ctx->ctx_state + ); +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + len += pagein_info_printf(buffer+len, ctx); +#endif + + buffer[len++] = '\n'; + + pos = begin+len; + if (pos < offset) { + len = 0; + begin = pos; + } + if (pos > offset+length) + goto done; + } + *eof = 1; + +done: + spin_unlock(&cpt_context_lock); + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) + len = length; + if(len < 0) + len = 0; + return len; +} + +void rst_context_release(cpt_context_t *ctx) +{ + list_del(&ctx->ctx_list); + spin_unlock(&cpt_context_lock); + + if (ctx->ctx_state > 0) + rst_resume(ctx); + ctx->ctx_state = CPT_CTX_ERROR; + + rst_close_dumpfile(ctx); + + if (ctx->anonvmas) { + int h; + for (h = 0; h < CPT_ANONVMA_HSIZE; h++) { + while (!hlist_empty(&ctx->anonvmas[h])) { + struct hlist_node *elem = ctx->anonvmas[h].first; + hlist_del(elem); + kfree(elem); + } + } + free_page((unsigned long)ctx->anonvmas); + } + cpt_flush_error(ctx); + if (ctx->errorfile) { + fput(ctx->errorfile); + ctx->errorfile = NULL; + } + if 
(ctx->error_msg) { + free_page((unsigned long)ctx->error_msg); + ctx->error_msg = NULL; + } +#ifdef CONFIG_VZ_CHECKPOINT_ITER + rst_drop_iter_dir(ctx); +#endif +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + if (ctx->pgin_task) + put_task_struct(ctx->pgin_task); +#endif + if (ctx->filejob_queue) + rst_flush_filejobs(ctx); + if (ctx->vdso) + free_page((unsigned long)ctx->vdso); + if (ctx->objcount) + eprintk_ctx("%d objects leaked\n", ctx->objcount); + kfree(ctx); + + spin_lock(&cpt_context_lock); +} + +static void __cpt_context_put(cpt_context_t *ctx) +{ + if (!--ctx->refcount) + rst_context_release(ctx); +} + +static void cpt_context_put(cpt_context_t *ctx) +{ + spin_lock(&cpt_context_lock); + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); +} + +cpt_context_t * rst_context_open(void) +{ + cpt_context_t *ctx; + + if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { + rst_context_init(ctx); + spin_lock(&cpt_context_lock); + list_add_tail(&ctx->ctx_list, &cpt_context_list); + spin_unlock(&cpt_context_lock); + ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); + if (ctx->error_msg != NULL) + ctx->error_msg[0] = 0; + } + return ctx; +} + +void rst_report_error(int err, cpt_context_t *ctx) +{ + if (ctx->statusfile) { + mm_segment_t oldfs; + int status = 7 /* VZ_ENVCREATE_ERROR */; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (ctx->statusfile->f_op && ctx->statusfile->f_op->write) + ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos); + set_fs(oldfs); + fput(ctx->statusfile); + ctx->statusfile = NULL; + } +} + + +static cpt_context_t * cpt_context_lookup(unsigned int ctxid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->contextid == ctxid) { + ctx->refcount++; + spin_unlock(&cpt_context_lock); + return ctx; + } + } + 
spin_unlock(&cpt_context_lock); + return NULL; +} + +static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) +{ + int err = 0; + cpt_context_t *ctx; + struct file *dfile = NULL; + + unlock_kernel(); + + if (cmd == CPT_TEST_CAPS) { + err = test_cpu_caps_and_features(); + goto out_lock; + } + + if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { + cpt_context_t *old_ctx; + + ctx = NULL; + if (cmd == CPT_JOIN_CONTEXT) { + err = -ENOENT; + ctx = cpt_context_lookup(arg); + if (!ctx) + goto out_lock; + } + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + file->private_data = ctx; + + if (old_ctx) { + if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { + old_ctx->sticky = 0; + old_ctx->refcount--; + } + __cpt_context_put(old_ctx); + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_lock; + } + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + if (ctx) + ctx->refcount++; + spin_unlock(&cpt_context_lock); + + if (!ctx) { + cpt_context_t *old_ctx; + + err = -ENOMEM; + ctx = rst_context_open(); + if (!ctx) + goto out_lock; + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + if (!old_ctx) { + ctx->refcount++; + file->private_data = ctx; + } else { + old_ctx->refcount++; + } + if (old_ctx) { + __cpt_context_put(ctx); + ctx = old_ctx; + } + spin_unlock(&cpt_context_lock); + } + + if (cmd == CPT_GET_CONTEXT) { + unsigned int contextid = (unsigned int)arg; + + err = -EINVAL; + if (ctx->contextid && ctx->contextid != contextid) + goto out_nosem; + if (!ctx->contextid) { + cpt_context_t *c1 = cpt_context_lookup(contextid); + if (c1) { + cpt_context_put(c1); + err = -EEXIST; + goto out_nosem; + } + ctx->contextid = contextid; + } + spin_lock(&cpt_context_lock); + if (!ctx->sticky) { + ctx->sticky = 1; + ctx->refcount++; + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_nosem; + } + + down(&ctx->main_sem); + + err = -EBUSY; + if 
(ctx->ctx_state < 0) + goto out; + + err = 0; + switch (cmd) { + case CPT_SET_DUMPFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + err = -EBADF; + dfile = fget(arg); + if (dfile == NULL) + break; + if (dfile->f_op == NULL || + dfile->f_op->read == NULL) { + fput(dfile); + break; + } + err = 0; + } + if (ctx->file) + fput(ctx->file); + ctx->file = dfile; + break; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + case CPT_SET_PAGEINFDIN: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + ctx->pagein_file_in = dfile; + break; + case CPT_SET_PAGEINFDOUT: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + ctx->pagein_file_out = dfile; + break; + case CPT_PAGEIND: + err = rst_pageind(ctx); + break; +#endif +#ifdef CONFIG_VZ_CHECKPOINT_ITER + case CPT_ITER: + err = rst_iteration(ctx); + break; +#endif + case CPT_SET_LOCKFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->lockfile) + fput(ctx->lockfile); + ctx->lockfile = dfile; + break; + case CPT_SET_STATUSFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->statusfile) + fput(ctx->statusfile); + ctx->statusfile = dfile; + break; + case CPT_SET_ERRORFD: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->errorfile) + fput(ctx->errorfile); + ctx->errorfile = dfile; + break; + case CPT_HARDLNK_ON: + ctx->hardlinked_on = 1; + break; + case CPT_SET_VEID: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + 
ctx->ve_id = arg; + break; + case CPT_UNDUMP: + if (ctx->ctx_state > 0) { + err = -ENOENT; + break; + } + ctx->ctx_state = CPT_CTX_UNDUMPING; + err = vps_rst_undump(ctx); + if (err) { + rst_report_error(err, ctx); + if (rst_kill(ctx) == 0) + ctx->ctx_state = CPT_CTX_IDLE; + } else { + ctx->ctx_state = CPT_CTX_UNDUMPED; + } + break; + case CPT_RESUME: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + err = rst_resume(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_KILL: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + err = rst_kill(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + default: + err = -EINVAL; + break; + } + +out: + cpt_flush_error(ctx); + up(&ctx->main_sem); +out_nosem: + cpt_context_put(ctx); +out_lock: + lock_kernel(); + if (err == -ERESTARTSYS || err == -ERESTARTNOINTR || + err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK) + err = -EINTR; + return err; +} + +static int rst_open(struct inode * inode, struct file * file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int rst_release(struct inode * inode, struct file * file) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + file->private_data = NULL; + if (ctx) + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); + + + module_put(THIS_MODULE); + return 0; +} + +static struct file_operations rst_fops = +{ + .owner = THIS_MODULE, + .ioctl = rst_ioctl, + .open = rst_open, + .release = rst_release, +}; + + +static struct proc_dir_entry *proc_ent; +extern void *schedule_tail_p; +extern void schedule_tail_hook(void); + +static struct ctl_table_header *ctl_header; + +static ctl_table debug_table[] = { + { + .procname = "rst", + .data = &debug_level, + .maxlen = sizeof(debug_level), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; +static ctl_table root_table[] = { + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 
0555, + .child = debug_table, + }, + { .ctl_name = 0 } +}; + +static int __init init_rst(void) +{ + int err; + + err = -ENOMEM; + ctl_header = register_sysctl_table(root_table); + if (!ctl_header) + goto err_mon; + + spin_lock_init(&cpt_context_lock); + INIT_LIST_HEAD(&cpt_context_list); + + err = -EINVAL; + proc_ent = proc_create("rst", 0600, NULL, NULL); + if (!proc_ent) + goto err_out; + + rst_fops.read = proc_ent->proc_fops->read; + rst_fops.write = proc_ent->proc_fops->write; + rst_fops.llseek = proc_ent->proc_fops->llseek; + proc_ent->proc_fops = &rst_fops; + + proc_ent->read_proc = proc_read; + proc_ent->data = NULL; + return 0; + +err_out: + unregister_sysctl_table(ctl_header); +err_mon: + return err; +} +module_init(init_rst); + +static void __exit exit_rst(void) +{ + remove_proc_entry("rst", NULL); + unregister_sysctl_table(ctl_header); + + spin_lock(&cpt_context_lock); + while (!list_empty(&cpt_context_list)) { + cpt_context_t *ctx; + ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); + + if (!ctx->sticky) + ctx->refcount++; + ctx->sticky = 0; + + BUG_ON(ctx->refcount != 1); + + __cpt_context_put(ctx); + } + spin_unlock(&cpt_context_lock); +} +module_exit(exit_rst); diff -urNp linux-2.6.32.48/kernel/cpt/rst_process.c linux-2.6.32.48-openvz/kernel/cpt/rst_process.c --- linux-2.6.32.48/kernel/cpt/rst_process.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_process.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,1663 @@ +/* + * + * kernel/cpt/rst_process.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include + +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_ubc.h" +#include "cpt_process.h" +#include "cpt_kernel.h" + + +#define HOOK_RESERVE 256 + +struct resume_info +{ + asmlinkage void (*hook)(struct resume_info *); + unsigned long hooks; +#define HOOK_TID 0 +#define HOOK_CONT 1 +#define HOOK_LSI 2 +#define HOOK_RESTART 3 + unsigned long tid_ptrs[2]; + siginfo_t last_siginfo; +}; + +#ifdef CONFIG_X86_32 + +#define IN_SYSCALL(regs) ((long)(regs)->orig_ax >= 0) +#define IN_ERROR(regs) ((long)(regs)->ax < 0) +#define SYSCALL_ERRNO(regs) (-(long)((regs)->ax)) +#define SYSCALL_RETVAL(regs) ((regs)->ax) +#define SYSCALL_NR(regs) ((regs)->orig_ax) + +#define SYSCALL_SETRET(regs,val) do { (regs)->ax = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->ax = (new); \ + (regs)->ip -= 2; } while (0) + +#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) + +/* In new kernels task_pt_regs() is define to something inappropriate */ +#undef task_pt_regs +#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1) + +#elif defined(CONFIG_X86_64) + +#define IN_SYSCALL(regs) ((long)(regs)->orig_ax >= 0) +#define IN_ERROR(regs) ((long)(regs)->ax < 0) +#define SYSCALL_ERRNO(regs) (-(long)((regs)->ax)) +#define SYSCALL_RETVAL(regs) ((regs)->ax) +#define SYSCALL_NR(regs) ((regs)->orig_ax) + +#define SYSCALL_SETRET(regs,val) do { (regs)->ax = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->ax = (new); \ + (regs)->ip -= 2; } while (0) + +#define __NR32_restart_syscall 0 +#define __NR32_rt_sigtimedwait 177 +#define __NR32_pause 29 +#define __NR32_futex 240 + +#define syscall_is(tsk,regs,name) 
((!(task_thread_info(tsk)->flags&_TIF_IA32) && \ + SYSCALL_NR(regs) == __NR_##name) || \ + ((task_thread_info(tsk)->flags&_TIF_IA32) && \ + SYSCALL_NR(regs) == __NR32_##name)) + +#elif defined (CONFIG_IA64) + +#define IN_SYSCALL(regs) ((long)(regs)->cr_ifs >= 0) +#define IN_ERROR(regs) ((long)(regs)->r10 == -1) +#define SYSCALL_ERRNO(regs) ((regs)->r10 == -1 ? (long)((regs)->r8) : 0) +#define SYSCALL_RETVAL(regs) ((regs)->r8) +#define SYSCALL_NR(regs) ((regs)->cr_ifs >= 0 ? (regs)->r15 : -1) + +#define SYSCALL_SETRET(regs,val) do { (regs)->r8 = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->r15 = (new); \ + (regs)->r10 = 0; \ + ia64_decrement_ip(regs); } while (0) + +#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) + +#else + +#error This arch is not supported + +#endif + +#define SYSCALL_RESTART(regs) SYSCALL_RESTART2(regs, SYSCALL_NR(regs)) + +pid_t vpid_to_pid(pid_t nr) +{ + pid_t vnr; + struct pid *pid; + + rcu_read_lock(); + pid = find_vpid(nr); + vnr = (pid == NULL ? 
-1 : pid->numbers[0].nr); + rcu_read_unlock(); + return vnr; +} + +static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si) +{ + memset(info, 0, sizeof(*info)); + switch(si->cpt_code & __SI_MASK) { + case __SI_TIMER: + info->si_tid = si->cpt_pid; + info->si_overrun = si->cpt_uid; + info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval); + info->si_sys_private = si->cpt_utime; + break; + case __SI_POLL: + info->si_band = si->cpt_pid; + info->si_fd = si->cpt_uid; + break; + case __SI_FAULT: + info->si_addr = cpt_ptr_import(si->cpt_sigval); +#ifdef __ARCH_SI_TRAPNO + info->si_trapno = si->cpt_pid; +#endif + break; + case __SI_CHLD: + info->si_pid = si->cpt_pid; + info->si_uid = si->cpt_uid; + info->si_status = si->cpt_sigval; + info->si_stime = si->cpt_stime; + info->si_utime = si->cpt_utime; + break; + case __SI_KILL: + case __SI_RT: + case __SI_MESGQ: + default: + info->si_pid = si->cpt_pid; + info->si_uid = si->cpt_uid; + info->si_ptr = cpt_ptr_import(si->cpt_sigval); + break; + } + info->si_signo = si->cpt_signo; + info->si_errno = si->cpt_errno; + info->si_code = si->cpt_code; +} + +static int restore_sigqueue(struct task_struct *tsk, + struct sigpending *queue, unsigned long start, + unsigned long end) +{ + while (start < end) { + struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start; + if (si->cpt_object == CPT_OBJ_SIGINFO) { + struct sigqueue *q = NULL; + struct user_struct *up; + + up = alloc_uid(get_exec_env()->user_ns, si->cpt_user); + if (!up) + return -ENOMEM; + q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); + if (!q) { + free_uid(up); + return -ENOMEM; + } + if (ub_siginfo_charge(q, get_exec_ub())) { + kmem_cache_free(sigqueue_cachep, q); + free_uid(up); + return -ENOMEM; + } + + INIT_LIST_HEAD(&q->list); + /* Preallocated elements (posix timers) are not + * supported yet. It is safe to replace them with + * a private one. 
*/
+			q->flags = 0;
+			q->user = up;
+			atomic_inc(&q->user->sigpending);
+
+			decode_siginfo(&q->info, si);
+			list_add_tail(&q->list, &queue->list);
+		}
+		start += si->cpt_next;
+	}
+	return 0;
+}
+
+/*
+ * Re-attach every restored task to the process group and session recorded
+ * in its image, and restore tty_old_pgrp when the image carries one.
+ *
+ * For each CPT_OBJ_TASK object:
+ *  - if the task's current pgrp/session vnr differs from the image value,
+ *    the matching struct pid is obtained via alloc_vpid_safe(), the old
+ *    link is detached and, for thread group leaders only, the new one is
+ *    attached under tasklist_lock;
+ *  - if the image has cpt_old_pgrp > 0 and the task has no tty_old_pgrp yet,
+ *    a reference to that pid is taken and stored.
+ *
+ * Returns 0 on success, -EINVAL if a task object has no task, a pid cannot
+ * be found/allocated, or the linkage does not match after re-attaching.
+ *
+ * Every error return inside an RCU read-side section drops the RCU read
+ * lock first, so the function never returns with rcu_read_lock() held.
+ */
+int rst_process_linkage(cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		struct cpt_task_image *ti = obj->o_image;
+
+		if (tsk == NULL) {
+			eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm);
+			return -EINVAL;
+		}
+
+		if (task_pgrp_vnr(tsk) != ti->cpt_pgrp) {
+			struct pid *pid;
+
+			rcu_read_lock();
+			pid = alloc_vpid_safe(ti->cpt_pgrp);
+			if (!pid) {
+				/* Balance rcu_read_lock() before bailing out. */
+				rcu_read_unlock();
+				eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+
+			write_lock_irq(&tasklist_lock);
+			detach_pid(tsk, PIDTYPE_PGID);
+			/* Only the thread group leader is linked to the new pid. */
+			if (thread_group_leader(tsk))
+				attach_pid(tsk, PIDTYPE_PGID, pid);
+			write_unlock_irq(&tasklist_lock);
+
+			if (task_pgrp_vnr(tsk) != pid_vnr(pid)) {
+				rcu_read_unlock();
+				eprintk_ctx("cannot set PGRP " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+			rcu_read_unlock();
+		}
+		if (task_session_vnr(tsk) != ti->cpt_session) {
+			struct pid *pid;
+
+			rcu_read_lock();
+			pid = alloc_vpid_safe(ti->cpt_session);
+			if (!pid) {
+				rcu_read_unlock();
+				eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+
+			write_lock_irq(&tasklist_lock);
+			detach_pid(tsk, PIDTYPE_SID);
+			if (thread_group_leader(tsk))
+				attach_pid(tsk, PIDTYPE_SID, pid);
+			write_unlock_irq(&tasklist_lock);
+
+			if (task_session_vnr(tsk) != pid_vnr(pid)) {
+				rcu_read_unlock();
+				eprintk_ctx("cannot set SID " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+			rcu_read_unlock();
+		}
+		if (ti->cpt_old_pgrp > 0 && !tsk->signal->tty_old_pgrp) {
+			struct pid *pid;
+
+			rcu_read_lock();
+			/* get_pid() takes the reference kept in tty_old_pgrp. */
+			pid = get_pid(find_vpid(ti->cpt_old_pgrp));
+			if (!pid) {
+				rcu_read_unlock();
+				eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk));
+				return -EINVAL;
+			}
+			tsk->signal->tty_old_pgrp = pid;
+			rcu_read_unlock();
+		}
+	}
+
+	return 0;
+}
+
+struct pid *alloc_vpid_safe(pid_t vnr)
+{
+	struct pid
*pid; + + pid = alloc_pid(current->nsproxy->pid_ns, vnr); + if (!pid) + pid = find_vpid(vnr); + return pid; +} + +static int +restore_one_signal_struct(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx) +{ + int err; + struct cpt_signal_image *si = cpt_get_buf(ctx); + + current->signal->tty = NULL; + + err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + +#if 0 /* this should have been restored in rst_process_linkage */ + if (task_pgrp_vnr(current) != si->cpt_pgrp) { + struct pid * pid = NULL, *free = NULL; + + rcu_read_lock(); + if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) { +#if 0 + if (!is_virtual_pid(si->cpt_pgrp)) { + eprintk_ctx("external process group " CPT_FID, CPT_TID(current)); + cpt_release_buf(ctx); + return -EINVAL; + } +#endif + pid = alloc_vpid_safe(si->cpt_pgrp); + free = pid; + } + write_lock_irq(&tasklist_lock); + if (pid != NULL) { + if (task_pgrp_nr(current) != pid_nr(pid)) { + detach_pid(current, PIDTYPE_PGID); + if (thread_group_leader(current)) { + attach_pid(current, PIDTYPE_PGID, pid); + free = NULL; + } + } + } + write_unlock_irq(&tasklist_lock); + if (free != NULL) + free_pid(free); + rcu_read_unlock(); + } +#endif + + current->signal->tty_old_pgrp = NULL; + if ((int)si->cpt_old_pgrp > 0) { + if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) { + current->signal->tty_old_pgrp = + alloc_pid(current->nsproxy->pid_ns, 0); + if (!current->signal->tty_old_pgrp) { + eprintk_ctx("failed to allocate stray tty_old_pgrp\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + } else { + rcu_read_lock(); + current->signal->tty_old_pgrp = + get_pid(alloc_vpid_safe(si->cpt_old_pgrp)); + rcu_read_unlock(); + if (!current->signal->tty_old_pgrp) { + dprintk_ctx("forward old tty PGID\n"); + current->signal->tty_old_pgrp = NULL; + } + } + } + +#if 0 /* this should have been restored in rst_process_linkage */ + if (task_session_vnr(current) != si->cpt_session) { + struct pid * pid = 
NULL, *free = NULL; + + rcu_read_lock(); + if (si->cpt_session_type == CPT_PGRP_ORPHAN) { +#if 0 + if (!is_virtual_pid(si->cpt_session)) { + eprintk_ctx("external process session " CPT_FID, CPT_TID(current)); + cpt_release_buf(ctx); + return -EINVAL; + } +#endif + pid = alloc_vpid_safe(si->cpt_session); + free = pid; + } + write_lock_irq(&tasklist_lock); + if (pid == NULL) + pid = find_vpid(si->cpt_session); + if (pid != NULL) { + if (task_session_nr(current) != pid_nr(pid)) { + detach_pid(current, PIDTYPE_SID); + set_task_session(current, pid_nr(pid)); + if (thread_group_leader(current)) { + attach_pid(current, PIDTYPE_SID, pid); + free = NULL; + } + } + } + write_unlock_irq(&tasklist_lock); + if (free != NULL) + free_pid(free); + rcu_read_unlock(); + } +#endif + + cpt_sigset_import(¤t->signal->shared_pending.signal, si->cpt_sigpending); + current->signal->leader = si->cpt_leader; + if (si->cpt_ctty != CPT_NULL) { + cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx); + if (obj) { + struct tty_struct *tty = obj->o_obj; + if (!tty->session || tty->session == + task_session(current)) { + put_pid(tty->session); + tty->session = get_pid(task_session(current)); + tty_kref_put(current->signal->tty); + current->signal->tty = tty_kref_get(tty); + } else { + wprintk_ctx("tty session mismatch\n"); + } + } + } + + if (si->cpt_curr_target) { + current->signal->curr_target = find_task_by_vpid(si->cpt_curr_target); + if (current->signal->curr_target == NULL) { + wprintk_ctx("oops, curr_target=NULL, pid=%u\n", si->cpt_curr_target); + current->signal->curr_target = current; + } + } + current->signal->flags = 0; + *exiting = si->cpt_group_exit; + current->signal->group_exit_code = si->cpt_group_exit_code; + if (si->cpt_group_exit_task) { + current->signal->group_exit_task = find_task_by_vpid(si->cpt_group_exit_task); + if (current->signal->group_exit_task == NULL) { + eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task); + 
cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	current->signal->notify_count = si->cpt_notify_count;
+	current->signal->group_stop_count = si->cpt_group_stop_count;
+
+	if (si->cpt_next > si->cpt_hdrlen) {
+		char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL);
+		if (buf == NULL) {
+			cpt_release_buf(ctx);
+			return -ENOMEM;
+		}
+		err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx,
+				 ti->cpt_signal + si->cpt_hdrlen);
+		if (err) {
+			kfree(buf);
+			cpt_release_buf(ctx);
+			return err;
+		}
+		restore_sigqueue(current,
+				 &current->signal->shared_pending, (unsigned long)buf,
+				 (unsigned long)buf + si->cpt_next - si->cpt_hdrlen);
+		kfree(buf);
+	}
+	cpt_release_buf(ctx);
+	return 0;
+}
+
+/*
+ * Restore the signal handler table of the current task from the image.
+ *
+ * Reads the CPT_OBJ_SIGHAND_STRUCT object at ti->cpt_sighand, resets all
+ * _NSIG actions of current->sighand to SIG_DFL with empty flags and mask,
+ * then walks the CPT_OBJ_SIGHANDLER records stored between cpt_hdrlen and
+ * cpt_next of that object and installs handler, restorer (not on IA64),
+ * flags and mask for each recorded signal slot.
+ *
+ * Returns 0 on success, the error from rst_get_object() if an object cannot
+ * be read, or -EINVAL if a record names a slot outside action[].
+ */
+int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_sighand_image si;
+	int i;
+	loff_t pos, endpos;
+
+	err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx);
+	if (err)
+		return err;
+
+	for (i=0; i<_NSIG; i++) {
+		current->sighand->action[i].sa.sa_handler = SIG_DFL;
+#ifndef CONFIG_IA64
+		current->sighand->action[i].sa.sa_restorer = 0;
+#endif
+		current->sighand->action[i].sa.sa_flags = 0;
+		memset(&current->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t));
+	}
+
+	pos = ti->cpt_sighand + si.cpt_hdrlen;
+	endpos = ti->cpt_sighand + si.cpt_next;
+	while (pos < endpos) {
+		struct cpt_sighandler_image shi;
+		struct k_sigaction *ka;
+
+		err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx);
+		if (err)
+			return err;
+
+		/*
+		 * cpt_signo comes straight from the image and is used as an
+		 * index into action[]; refuse anything outside the table
+		 * instead of writing past it.
+		 */
+		if ((unsigned int)shi.cpt_signo >= _NSIG) {
+			eprintk_ctx("invalid signal number %u in sighandler image\n",
+				    (unsigned int)shi.cpt_signo);
+			return -EINVAL;
+		}
+		ka = &current->sighand->action[shi.cpt_signo];
+
+		ka->sa.sa_handler = (void*)(unsigned long)shi.cpt_handler;
+#ifndef CONFIG_IA64
+		ka->sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer;
+#endif
+		ka->sa.sa_flags = shi.cpt_flags;
+		cpt_sigset_import(&ka->sa.sa_mask, shi.cpt_mask);
+		pos += shi.cpt_next;
+	}
+
+	return 0;
+}
+
+
+__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	__u32 flag = 0;
+
+
if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx)) + flag |= CLONE_THREAD; + if (ti->cpt_sighand == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx)) + flag |= CLONE_SIGHAND; + return flag; +} + +int +rst_signal_complete(struct cpt_task_image *ti, int * exiting, cpt_context_t *ctx) +{ + int err; + cpt_object_t *obj; + + if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) { + return -EINVAL; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx); + if (obj) { + struct sighand_struct *sig = current->sighand; + if (obj->o_obj != sig) { + return -EINVAL; + } + } else { + obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setpos(obj, ti->cpt_sighand, ctx); + err = restore_one_sighand_struct(ti, ctx); + if (err) + return err; + } + + + obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx); + if (obj) { + struct signal_struct *sig = current->signal; + if (obj->o_obj != sig) { + return -EINVAL; + } +/* if (current->signal) { + pid_t session; + + session = process_session(current); + set_process_vgroup(current, session); + set_signal_vsession(current->signal, session); + }*/ + } else { + obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setpos(obj, ti->cpt_signal, ctx); + err = restore_one_signal_struct(ti, exiting, ctx); + if (err) + return err; + } + + return 0; +} + +#ifdef CONFIG_X86 +static u32 decode_segment(u32 segid) +{ + if (segid == CPT_SEG_ZERO) + return 0; + + /* TLS descriptors */ + if (segid <= CPT_SEG_TLS3) + return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3; + + /* LDT descriptor, it is just an index to LDT array */ + if (segid >= CPT_SEG_LDT) + return ((segid - CPT_SEG_LDT) << 3) | 7; + + /* Check for one of standard descriptors */ +#ifdef CONFIG_X86_64 + if (segid == CPT_SEG_USER32_DS) + return __USER32_DS; + if (segid 
== CPT_SEG_USER32_CS) + return __USER32_CS; + if (segid == CPT_SEG_USER64_DS) + return __USER_DS; + if (segid == CPT_SEG_USER64_CS) + return __USER_CS; +#else + if (segid == CPT_SEG_USER32_DS) + return __USER_DS; + if (segid == CPT_SEG_USER32_CS) + return __USER_CS; +#endif + wprintk("Invalid segment reg %d\n", segid); + return 0; +} +#endif + +#if defined (CONFIG_IA64) +void ia64_decrement_ip (struct pt_regs *regs) +{ + unsigned long w0, ri = ia64_psr(regs)->ri - 1; + + if (ia64_psr(regs)->ri == 0) { + regs->cr_iip -= 16; + ri = 2; + get_user(w0, (char __user *) regs->cr_iip + 0); + if (((w0 >> 1) & 0xf) == 2) { + /* + * rfi'ing to slot 2 of an MLX bundle causes + * an illegal operation fault. We don't want + * that to happen... + */ + ri = 1; + } + } + ia64_psr(regs)->ri = ri; +} +#endif + +static void rst_child_tid(unsigned long *child_tids) +{ + dprintk("rct: " CPT_FID "\n", CPT_TID(current)); + current->clear_child_tid = (void*)child_tids[0]; + current->set_child_tid = (void*)child_tids[1]; +} + +static void rst_last_siginfo(void) +{ + int signr; + siginfo_t *info = current->last_siginfo; + struct pt_regs *regs = task_pt_regs(current); + struct k_sigaction *ka; + int ptrace_id; + + dprintk("rlsi: " CPT_FID "\n", CPT_TID(current)); + + spin_lock_irq(¤t->sighand->siglock); + current->last_siginfo = NULL; + recalc_sigpending(); + + ptrace_id = current->pn_state; + clear_pn_state(current); + + switch (ptrace_id) { + case PN_STOP_TF: + case PN_STOP_TF_RT: + /* frame_*signal */ + dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %u %lu\n", + task_pid_vnr(current), current->pid, current->comm, + info->si_signo, info->si_code, + current->exit_code, SYSCALL_NR(regs), + current->ptrace, current->ptrace_message); + goto out; + case PN_STOP_ENTRY: + case PN_STOP_LEAVE: + /* do_syscall_trace */ + spin_unlock_irq(¤t->sighand->siglock); + dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code); + if (current->exit_code) { + send_sig(current->exit_code, current, 1); 
+ current->exit_code = 0; + } + if (IN_SYSCALL(regs)) { + if (ptrace_id == PN_STOP_ENTRY +#ifdef CONFIG_X86 + && SYSCALL_ERRNO(regs) == ENOSYS +#endif + ) + SYSCALL_RESTART(regs); + else if (IN_ERROR(regs) && + syscall_is(current, regs, rt_sigtimedwait) && + (SYSCALL_ERRNO(regs) == EAGAIN || + SYSCALL_ERRNO(regs) == EINTR)) + SYSCALL_RESTART(regs); + } + return; + case PN_STOP_FORK: + /* fork */ + SYSCALL_SETRET(regs, current->ptrace_message); + dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs)); + goto out; + case PN_STOP_VFORK: + /* after vfork */ + SYSCALL_SETRET(regs, current->ptrace_message); + dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs)); + goto out; + case PN_STOP_SIGNAL: + /* normal case : dequeue signal */ + break; + case PN_STOP_EXIT: + dprintk("ptrace exit caught\n"); + current->ptrace &= ~PT_TRACE_EXIT; + spin_unlock_irq(¤t->sighand->siglock); + module_put(THIS_MODULE); + complete_and_exit(NULL, current->ptrace_message); + BUG(); + case PN_STOP_EXEC: + eprintk("ptrace after exec caught: must not happen\n"); + BUG(); + default: + eprintk("ptrace with unknown identity %d\n", ptrace_id); + BUG(); + } + + signr = current->exit_code; + if (signr == 0) { + dprintk("rlsi: canceled signal %d\n", info->si_signo); + goto out; + } + current->exit_code = 0; + + if (signr != info->si_signo) { + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_pid_vnr(current->parent); + info->si_uid = current->parent->cred->uid; + } + + /* If the (new) signal is now blocked, requeue it. 
*/ + if (sigismember(¤t->blocked, signr)) { + dprintk("going to requeue signal %d\n", signr); + goto out_resend_sig; + } + + ka = ¤t->sighand->action[signr-1]; + if (ka->sa.sa_handler == SIG_IGN) { + dprintk("going to resend signal %d (ignored)\n", signr); + goto out; + } + if (ka->sa.sa_handler != SIG_DFL) { + dprintk("going to resend signal %d (not SIG_DFL)\n", signr); + goto out_resend_sig; + } + if (signr == SIGCONT || + signr == SIGCHLD || + signr == SIGWINCH || + signr == SIGURG || + current->pid == 1) + goto out; + + /* All the rest, which we cannot handle are requeued. */ + dprintk("going to resend signal %d (sigh)\n", signr); +out_resend_sig: + spin_unlock_irq(¤t->sighand->siglock); + send_sig_info(signr, info, current); + return; + +out: + spin_unlock_irq(¤t->sighand->siglock); +} + +static void rst_finish_stop(void) +{ + /* ... + * do_signal() -> + * get_signal_to_deliver() -> + * do_signal_stop() -> + * finish_stop() + * + * Normally after SIGCONT it will dequeue the next signal. If no signal + * is found, do_signal restarts syscall unconditionally. + * Otherwise signal handler is pushed on user stack. + */ + + dprintk("rfs: " CPT_FID "\n", CPT_TID(current)); + + clear_stop_state(current); + current->exit_code = 0; +} + +static void rst_restart_sys(void) +{ + struct pt_regs *regs = task_pt_regs(current); + + /* This hook is supposed to be executed, when we have + * to complete some interrupted syscall. + */ + dprintk("rrs: " CPT_FID "\n", CPT_TID(current)); + + if (!IN_SYSCALL(regs) || !IN_ERROR(regs)) + return; + +#ifdef __NR_pause + if (syscall_is(current,regs,pause)) { + if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } + } else +#else + /* On this arch pause() is simulated with sigsuspend(). 
*/ + if (syscall_is(current,regs,rt_sigsuspend)) { + if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } + } else +#endif + if (syscall_is(current,regs,rt_sigtimedwait)) { + if (SYSCALL_ERRNO(regs) == EAGAIN || + SYSCALL_ERRNO(regs) == EINTR) { + SYSCALL_RESTART(regs); + } + } else if (syscall_is(current,regs,futex)) { + if (SYSCALL_ERRNO(regs) == EINTR && + !signal_pending(current)) { + SYSCALL_RESTART(regs); + } + } + + if (!signal_pending(current)) { + if (SYSCALL_ERRNO(regs) == ERESTARTSYS || + SYSCALL_ERRNO(regs) == ERESTARTNOINTR || + SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { + SYSCALL_RESTART(regs); + } else if (SYSCALL_ERRNO(regs) == ERESTART_RESTARTBLOCK) { + int new = __NR_restart_syscall; +#ifdef CONFIG_X86_64 + if (task_thread_info(current)->flags&_TIF_IA32) + new = __NR32_restart_syscall; +#endif + SYSCALL_RESTART2(regs, new); + } + } +} + +#ifdef CONFIG_X86_32 + +static int restore_registers(struct task_struct *tsk, struct pt_regs *regs, + struct cpt_task_image *ti, struct cpt_x86_regs *b, + struct resume_info **rip, struct cpt_context *ctx) +{ + extern char i386_ret_from_resume; + + if (b->cpt_object != CPT_OBJ_X86_REGS) + return -EINVAL; + + tsk->thread.sp = (unsigned long) regs; + tsk->thread.sp0 = (unsigned long) (regs+1); + tsk->thread.ip = (unsigned long) &i386_ret_from_resume; + + tsk->thread.gs = decode_segment(b->cpt_gs); + task_user_gs(tsk) = decode_segment(b->cpt_ugs); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + regs->bx = b->cpt_ebx; + regs->cx = b->cpt_ecx; + regs->dx = b->cpt_edx; + regs->si = b->cpt_esi; + regs->di = b->cpt_edi; + regs->bp = b->cpt_ebp; + regs->ax = b->cpt_eax; + regs->ds = b->cpt_xds; + regs->es = b->cpt_xes; + regs->orig_ax = 
b->cpt_orig_eax; + regs->ip = b->cpt_eip; + regs->cs = b->cpt_xcs; + regs->flags = b->cpt_eflags; + regs->sp = b->cpt_esp; + regs->ss = b->cpt_xss; + + regs->cs = decode_segment(b->cpt_xcs); + regs->ss = decode_segment(b->cpt_xss); + regs->ds = decode_segment(b->cpt_xds); + regs->es = decode_segment(b->cpt_xes); + regs->fs = decode_segment(b->cpt_fs); + + tsk->thread.sp -= HOOK_RESERVE; + memset((void*)tsk->thread.sp, 0, HOOK_RESERVE); + *rip = (void*)tsk->thread.sp; + + return 0; +} + +#elif defined(CONFIG_X86_64) + +static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s) +{ + memset(d, 0, sizeof(struct pt_regs)); + d->bp = s->cpt_ebp; + d->bx = s->cpt_ebx; + d->ax = (s32)s->cpt_eax; + d->cx = s->cpt_ecx; + d->dx = s->cpt_edx; + d->si = s->cpt_esi; + d->di = s->cpt_edi; + d->orig_ax = (s32)s->cpt_orig_eax; + d->ip = s->cpt_eip; + d->cs = s->cpt_xcs; + d->flags = s->cpt_eflags; + d->sp = s->cpt_esp; + d->ss = s->cpt_xss; +} + +static int restore_registers(struct task_struct *tsk, struct pt_regs *regs, + struct cpt_task_image *ti, struct cpt_obj_bits *hdr, + struct resume_info **rip, struct cpt_context *ctx) +{ + if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) { + struct cpt_x86_64_regs *b = (void*)hdr; + + tsk->thread.sp = (unsigned long) regs; + tsk->thread.sp0 = (unsigned long) (regs+1); + + tsk->thread.fs = b->cpt_fsbase; + tsk->thread.gs = b->cpt_gsbase; + tsk->thread.fsindex = decode_segment(b->cpt_fsindex); + tsk->thread.gsindex = decode_segment(b->cpt_gsindex); + tsk->thread.ds = decode_segment(b->cpt_ds); + tsk->thread.es = decode_segment(b->cpt_es); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs)); + + tsk->thread.usersp = regs->sp; + regs->cs = 
decode_segment(b->cpt_cs); + regs->ss = decode_segment(b->cpt_ss); + } else if (hdr->cpt_object == CPT_OBJ_X86_REGS) { + struct cpt_x86_regs *b = (void*)hdr; + + tsk->thread.sp = (unsigned long) regs; + tsk->thread.sp0 = (unsigned long) (regs+1); + + tsk->thread.fs = 0; + tsk->thread.gs = 0; + tsk->thread.fsindex = decode_segment(b->cpt_fs); + tsk->thread.gsindex = decode_segment(b->cpt_ugs); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + xlate_ptregs_32_to_64(regs, b); + + tsk->thread.usersp = regs->sp; + regs->cs = decode_segment(b->cpt_xcs); + regs->ss = decode_segment(b->cpt_xss); + tsk->thread.ds = decode_segment(b->cpt_xds); + tsk->thread.es = decode_segment(b->cpt_xes); + } else { + return -EINVAL; + } + + tsk->thread.sp -= HOOK_RESERVE; + memset((void*)tsk->thread.sp, 0, HOOK_RESERVE); + *rip = (void*)tsk->thread.sp; + return 0; +} + +#elif defined(CONFIG_IA64) + +#define MASK(nbits) ((1UL << (nbits)) - 1) /* mask with NBITS bits set */ + +#define PUT_BITS(first, last, nat) \ + ({ \ + unsigned long bit = ia64_unat_pos(&pt->r##first); \ + unsigned long nbits = (last - first + 1); \ + unsigned long mask = MASK(nbits) << first; \ + long dist; \ + if (bit < first) \ + dist = 64 + bit - first; \ + else \ + dist = bit - first; \ + ia64_rotl(nat & mask, dist); \ + }) + +unsigned long +ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat) +{ + unsigned long scratch_unat; + + /* + * Registers that are stored consecutively in struct pt_regs + * can be handled in parallel. If the register order in + * struct_pt_regs changes, this code MUST be updated. 
+ */ + scratch_unat = PUT_BITS( 1, 1, nat); + scratch_unat |= PUT_BITS( 2, 3, nat); + scratch_unat |= PUT_BITS(12, 13, nat); + scratch_unat |= PUT_BITS(14, 14, nat); + scratch_unat |= PUT_BITS(15, 15, nat); + scratch_unat |= PUT_BITS( 8, 11, nat); + scratch_unat |= PUT_BITS(16, 31, nat); + + return scratch_unat; + +} + +static unsigned long +ia64_put_saved_nat_bits (struct switch_stack *pt, unsigned long nat) +{ + unsigned long scratch_unat; + + scratch_unat = PUT_BITS( 4, 7, nat); + + return scratch_unat; + +} + +#undef PUT_BITS + + +static int restore_registers(struct task_struct *tsk, struct pt_regs *pt, + struct cpt_task_image *ti, + struct cpt_ia64_regs *r, + struct resume_info **rip, + struct cpt_context *ctx) +{ + extern char ia64_ret_from_resume; + struct switch_stack *sw; + struct resume_info *ri; + struct ia64_psr *psr = ia64_psr(pt); + void *krbs = (void *)tsk + IA64_RBS_OFFSET; + unsigned long reg; + + if (r->cpt_object != CPT_OBJ_IA64_REGS) + return -EINVAL; + + if (r->num_regs > 96) { + eprintk(CPT_FID " too much RSE regs %lu\n", + CPT_TID(tsk), r->num_regs); + return -EINVAL; + } + + *rip = ri = ((void*)pt) - HOOK_RESERVE; + sw = ((struct switch_stack *) ri) - 1; + + memmove(sw, (void*)tsk->thread.ksp + 16, sizeof(struct switch_stack)); + memset(ri, 0, HOOK_RESERVE); + + /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ + memcpy(&pt->r1, &r->gr[1], 8*(2-1)); + memcpy(&pt->r2, &r->gr[2], 8*(4-2)); + memcpy(&pt->r8, &r->gr[8], 8*(12-8)); + memcpy(&pt->r12, &r->gr[12], 8*(14-12)); + memcpy(&pt->r14, &r->gr[14], 8*(15-14)); + memcpy(&pt->r15, &r->gr[15], 8*(16-15)); + memcpy(&pt->r16, &r->gr[16], 8*(32-16)); + + pt->b0 = r->br[0]; + pt->b6 = r->br[6]; + pt->b7 = r->br[7]; + + pt->ar_bspstore = r->ar_bspstore; + pt->ar_unat = r->ar_unat; + pt->ar_pfs = r->ar_pfs; + pt->ar_ccv = r->ar_ccv; + pt->ar_fpsr = r->ar_fpsr; + pt->ar_csd = r->ar_csd; + pt->ar_ssd = r->ar_ssd; + pt->ar_rsc = r->ar_rsc; + + pt->cr_iip = r->cr_iip; + pt->cr_ipsr = r->cr_ipsr; + + 
pt->pr = r->pr; + + pt->cr_ifs = r->cfm; + + /* fpregs 6..9,10..11 are in pt_regs */ + memcpy(&pt->f6, &r->fr[2*6], 16*(10-6)); + memcpy(&pt->f10, &r->fr[2*10], 16*(12-10)); + /* fpreg 12..15 are on switch stack */ + memcpy(&sw->f12, &r->fr[2*12], 16*(16-12)); + /* fpregs 32...127 */ + tsk->thread.flags |= IA64_THREAD_FPH_VALID; + memcpy(tsk->thread.fph, &r->fr[32*2], 16*(128-32)); + ia64_drop_fpu(tsk); + psr->dfh = 1; + + memcpy(&sw->r4, &r->gr[4], 8*(8-4)); + memcpy(&sw->b1, &r->br[1], 8*(6-1)); + sw->ar_lc = r->ar_lc; + + memcpy(&sw->f2, &r->fr[2*2], 16*(6-2)); + memcpy(&sw->f16, &r->fr[2*16], 16*(32-16)); + + sw->caller_unat = 0; + sw->ar_fpsr = pt->ar_fpsr; + sw->ar_unat = 0; + if (r->nat[0] & 0xFFFFFF0FUL) + sw->caller_unat = ia64_put_scratch_nat_bits(pt, r->nat[0]); + if (r->nat[0] & 0xF0) + sw->ar_unat = ia64_put_saved_nat_bits(sw, r->nat[0]); + + sw->ar_bspstore = (unsigned long)ia64_rse_skip_regs(krbs, r->num_regs); + memset(krbs, 0, (void*)sw->ar_bspstore - krbs); + sw->ar_rnat = 0; + sw->ar_pfs = 0; + + /* This is tricky. When we are in syscall, we have frame + * of output register (sometimes, plus one input reg sometimes). + * It is not so easy to restore such frame, RSE optimizes + * and does not fetch those regs from backstore. So, we restore + * the whole frame as local registers, and then repartition it + * in ia64_ret_from_resume(). 
+ */ + if ((long)pt->cr_ifs >= 0) { + unsigned long out = (r->cfm&0x7F) - ((r->cfm>>7)&0x7F); + sw->ar_pfs = out | (out<<7); + } + if (r->ar_ec) + sw->ar_pfs |= (r->ar_ec & 0x3F) << 52; + + for (reg = 0; reg < r->num_regs; reg++) { + unsigned long *ptr = ia64_rse_skip_regs(krbs, reg); + unsigned long *rnatp; + unsigned long set_rnat = 0; + + *ptr = r->gr[32+reg]; + + if (reg < 32) + set_rnat = (r->nat[0] & (1UL<<(reg+32))); + else + set_rnat = (r->nat[1] & (1UL<<(reg-32))); + + if (set_rnat) { + rnatp = ia64_rse_rnat_addr(ptr); + if ((unsigned long)rnatp >= sw->ar_bspstore) + rnatp = &sw->ar_rnat; + *rnatp |= (1UL<b0 = (unsigned long) &ia64_ret_from_resume; + tsk->thread.ksp = (unsigned long) sw - 16; + +#define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */ +#define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */ +#define PRED_USER_STACK 3 /* returning to user-stacks? */ +#define PRED_SYSCALL 4 /* inside a system call? */ +#define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */ + + pt->loadrs = r->loadrs; + sw->pr = 0; + sw->pr &= ~(1UL << PRED_LEAVE_SYSCALL); + sw->pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_NON_SYSCALL)); + sw->pr &= ~(1UL << PRED_KERNEL_STACK); + sw->pr |= (1UL << PRED_USER_STACK); + if ((long)pt->cr_ifs < 0) { + sw->pr |= (1UL << PRED_NON_SYSCALL); + } else { + sw->pr |= ((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL)); + } + + return 0; +} +#endif + +asmlinkage void rst_resume_work(struct resume_info *ri) +{ + if (ri->hooks & (1<tid_ptrs); + if (ri->hooks & (1<hooks & (1<hooks & (1<thread.xstate->fxsave.mxcsr &= 0x0000ffbf; +#endif +} + +#ifdef CONFIG_X86 +#include +#endif + +#define RLIM_INFINITY32 0xffffffff +#define RLIM_INFINITY64 (~0ULL) + +#ifdef CONFIG_X86_64 +#define rst_rlim_32_to_64(a, i, t, im) \ +do { \ + if (im->cpt_rlim_##a[i] == RLIM_INFINITY32) \ + t->signal->rlim[i].rlim_##a = RLIM_INFINITY64; \ + else \ + t->signal->rlim[i].rlim_##a = im->cpt_rlim_##a[i]; \ +} while (0) +#elif defined(CONFIG_X86_32) 
+#define rst_rlim_64_to_32(a, i, t, im) \ +do { \ + if (im->cpt_rlim_##a[i] == RLIM_INFINITY64) \ + t->signal->rlim[i].rlim_##a = RLIM_INFINITY32; \ + else if (im->cpt_rlim_##a[i] > RLIM_INFINITY32) { \ + eprintk_ctx("rlimit %Lu is too high for 32-bit task, " \ + "dump file is corrupted\n", \ + im->cpt_rlim_##a[i]); \ + return -EINVAL; \ + } else \ + t->signal->rlim[i].rlim_##a = im->cpt_rlim_##a[i]; \ +} while (0) +#endif + +int rst_restore_process(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + struct pt_regs * regs; + struct cpt_object_hdr *b; + struct cpt_siginfo_image *lsi = NULL; + struct resume_info *ri = NULL; + int i; + int err = 0; +#ifdef CONFIG_BEANCOUNTERS + struct task_beancounter *tbc; + struct user_beancounter *new_bc, *old_bc; +#endif + + if (tsk == NULL) { + eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm); + return -EFAULT; + } + + wait_task_inactive(tsk, 0); +#ifdef CONFIG_BEANCOUNTERS + tbc = &tsk->task_bc; + new_bc = rst_lookup_ubc(ti->cpt_exec_ub, ctx); + err = virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTTSK, new_bc); + if (err & NOTIFY_FAIL) { + put_beancounter(new_bc); + return -ECHRNG; + } + old_bc = tbc->exec_ub; + if ((err & VIRTNOTIFY_CHANGE) && old_bc != new_bc) { + dprintk(" *** replacing ub %p by %p for %p (%d %s)\n", + old_bc, new_bc, tsk, + tsk->pid, tsk->comm); + tbc->exec_ub = new_bc; + new_bc = old_bc; + } + put_beancounter(new_bc); +#endif + regs = task_pt_regs(tsk); + + if (!tsk->exit_state) { + tsk->lock_depth = -1; +#ifdef CONFIG_PREEMPT + task_thread_info(tsk)->preempt_count--; +#endif + } + + if (tsk->static_prio != ti->cpt_static_prio) + set_user_nice(tsk, PRIO_TO_NICE((s32)ti->cpt_static_prio)); + + cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked); + cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked); + cpt_sigset_import(&tsk->saved_sigmask, 
ti->cpt_sigsuspend_blocked); + cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending); + +#ifdef CONFIG_IA64 + SET_UNALIGN_CTL(tsk, ti->cpt_prctl_uac); + SET_FPEMU_CTL(tsk, ti->cpt_prctl_fpemu); +#endif + tsk->did_exec = (ti->cpt_did_exec != 0); + tsk->utime = ti->cpt_utime; + tsk->stime = ti->cpt_stime; + if (ctx->image_version == CPT_VERSION_8) + tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC); + else + cpt_timespec_import(&tsk->start_time, ti->cpt_starttime); + _set_normalized_timespec(&tsk->start_time, + tsk->start_time.tv_sec + + VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_sec, + tsk->start_time.tv_nsec + + VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_nsec); + + tsk->nvcsw = ti->cpt_nvcsw; + tsk->nivcsw = ti->cpt_nivcsw; + tsk->min_flt = ti->cpt_min_flt; + tsk->maj_flt = ti->cpt_maj_flt; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) + tsk->cutime = ti->cpt_cutime; + tsk->cstime = ti->cpt_cstime; + tsk->cnvcsw = ti->cpt_cnvcsw; + tsk->cnivcsw = ti->cpt_cnivcsw; + tsk->cmin_flt = ti->cpt_cmin_flt; + tsk->cmaj_flt = ti->cpt_cmaj_flt; + + BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS); + + for (i=0; irlim[i].rlim_cur = ti->cpt_rlim_cur[i]; + tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i]; + } +#else + if (thread_group_leader(tsk) && tsk->signal) { + tsk->signal->utime = ti->cpt_utime; + tsk->signal->stime = ti->cpt_stime; + tsk->signal->cutime = ti->cpt_cutime; + tsk->signal->cstime = ti->cpt_cstime; + tsk->signal->nvcsw = ti->cpt_nvcsw; + tsk->signal->nivcsw = ti->cpt_nivcsw; + tsk->signal->cnvcsw = ti->cpt_cnvcsw; + tsk->signal->cnivcsw = ti->cpt_cnivcsw; + tsk->signal->min_flt = ti->cpt_min_flt; + tsk->signal->maj_flt = ti->cpt_maj_flt; + tsk->signal->cmin_flt = ti->cpt_cmin_flt; + tsk->signal->cmaj_flt = ti->cpt_cmaj_flt; + + for (i=0; iimage_arch == CPT_OS_ARCH_I386) { + rst_rlim_32_to_64(cur, i, tsk, ti); + rst_rlim_32_to_64(max, i, tsk, ti); + } else +#elif defined(CONFIG_X86_32) + if (ctx->image_arch == CPT_OS_ARCH_EMT64) { + 
rst_rlim_64_to_32(cur, i, tsk, ti); + rst_rlim_64_to_32(max, i, tsk, ti); + } else +#endif + { + tsk->signal->rlim[i].rlim_cur = + ti->cpt_rlim_cur[i]; + tsk->signal->rlim[i].rlim_max = + ti->cpt_rlim_max[i]; + } + } + } +#endif + +#ifdef CONFIG_X86 + for (i=0; i<3; i++) { + if (i >= GDT_ENTRY_TLS_ENTRIES) { + eprintk_ctx("too many tls descs\n"); + } else { + tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF; + tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32; + } + } +#endif + + clear_stopped_child_used_math(tsk); + + b = (void *)(ti+1); + while ((void*)b < ((void*)ti) + ti->cpt_next) { + /* Siginfo objects are at the end of obj array */ + if (b->cpt_object == CPT_OBJ_SIGINFO) { + struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); + restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next); + set_exec_env(env); + break; + } + + switch (b->cpt_object) { +#ifdef CONFIG_X86 + case CPT_OBJ_BITS: + if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE && + cpu_has_fxsr) { + if (init_fpu(tsk)) + return -ENOMEM; + memcpy(tsk->thread.xstate, + (void*)b + b->cpt_hdrlen, + sizeof(struct i387_fxsave_struct)); + rst_apply_mxcsr_mask(tsk); + if (ti->cpt_used_math) + set_stopped_child_used_math(tsk); + } +#ifndef CONFIG_X86_64 + else if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD && + !cpu_has_fxsr) { + if (init_fpu(tsk)) + return -ENOMEM; + memcpy(tsk->thread.xstate, + (void*)b + b->cpt_hdrlen, + sizeof(struct i387_fsave_struct)); + if (ti->cpt_used_math) + set_stopped_child_used_math(tsk); + } +#endif + break; +#endif + case CPT_OBJ_LASTSIGINFO: + lsi = (void*)b; + break; + case CPT_OBJ_X86_REGS: + case CPT_OBJ_X86_64_REGS: + case CPT_OBJ_IA64_REGS: + if (restore_registers(tsk, regs, ti, (void*)b, &ri, ctx)) { + eprintk_ctx("cannot restore registers: image is corrupted\n"); + return -EINVAL; + } + break; + case CPT_OBJ_SIGALTSTACK: { + struct cpt_sigaltstack_image *sas; + sas = (struct cpt_sigaltstack_image *)b; + 
tsk->sas_ss_sp = sas->cpt_stack; + tsk->sas_ss_size = sas->cpt_stacksize; + break; + } + case CPT_OBJ_TASK_AUX: { + struct cpt_task_aux_image *ai; + ai = (struct cpt_task_aux_image *)b; + tsk->robust_list = cpt_ptr_import(ai->cpt_robust_list); +#ifdef CONFIG_X86_64 +#ifdef CONFIG_COMPAT + if (task_thread_info(tsk)->flags&_TIF_IA32) { + tsk->robust_list = (void __user *)NULL; + tsk->compat_robust_list = cpt_ptr_import(ai->cpt_robust_list); + } +#endif +#endif + break; + } + } + b = ((void*)b) + b->cpt_next; + } + + if (ri == NULL && !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + eprintk_ctx("missing register info\n"); + return -EINVAL; + } + + if (ti->cpt_ppid != ti->cpt_rppid) { + struct task_struct *parent; + struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); + write_lock_irq(&tasklist_lock); + parent = find_task_by_vpid(ti->cpt_ppid); + if (parent && parent != tsk->parent) { + list_add(&tsk->ptrace_entry, &tsk->parent->ptraced); + /* + * Ptraced kids are no longer in the parent children + * remove_parent(tsk); + * tsk->parent = parent; + * add_parent(tsk); + */ + } + write_unlock_irq(&tasklist_lock); + set_exec_env(env); + } + + tsk->ptrace_message = ti->cpt_ptrace_message; + tsk->pn_state = ti->cpt_pn_state; + tsk->stopped_state = ti->cpt_stopped_state; + task_thread_info(tsk)->flags = ti->cpt_thrflags; + + /* The image was created with kernel < 2.6.16, while + * task hanged in sigsuspend -> do_signal. + * + * FIXME! This needs more brain efforts... 
+ */ + if (ti->cpt_sigsuspend_state) { + set_restore_sigmask(); + } + +#ifdef CONFIG_X86_64 + task_thread_info(tsk)->flags |= _TIF_FORK | _TIF_RESUME; + if (!ti->cpt_64bit) + task_thread_info(tsk)->flags |= _TIF_IA32; +#endif + +#ifdef CONFIG_X86_32 + do { + if (regs->orig_ax == __NR__newselect && regs->di) { + struct timeval tv; + if (access_process_vm(tsk, regs->di, &tv, + sizeof(tv), 0) != sizeof(tv)) { + wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, + regs->di); + break; + } + dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, + tv.tv_sec, tv.tv_usec); + tv.tv_sec -= ctx->delta_time.tv_sec; + if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { + tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; + tv.tv_sec--; + } else { + tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; + } + if (tv.tv_sec < 0) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } + dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, + tv.tv_sec, tv.tv_usec); + if (access_process_vm(tsk, regs->di, &tv, + sizeof(tv), 1) != sizeof(tv)) { + wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, regs->di); + } + + } else if (regs->orig_ax == __NR_select && regs->di) { + struct { + unsigned long n; + fd_set __user *inp, *outp, *exp; + struct timeval __user *tvp; + } a; + struct timeval tv; + if (access_process_vm(tsk, regs->bx, &a, + sizeof(a), 0) != sizeof(a)) { + wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid); + break; + } + if (access_process_vm(tsk, (unsigned long)a.tvp, + &tv, sizeof(tv), 0) != sizeof(tv)) { + wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid); + break; + } + dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n", + tsk->pid, tv.tv_sec, tv.tv_usec); + tv.tv_sec -= ctx->delta_time.tv_sec; + if (tv.tv_usec < 
ctx->delta_time.tv_nsec / 1000) { + tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; + tv.tv_sec--; + } else { + tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; + } + if (tv.tv_sec < 0) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } + dprintk_ctx("task %d: New timeval in select: %ld.%ld\n", + tsk->pid, tv.tv_sec, tv.tv_usec); + if (access_process_vm(tsk, (unsigned long)a.tvp, + &tv, sizeof(tv), 1) != sizeof(tv)) { + wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid); + } + } + } while (0); +#endif + + if (ri && IN_SYSCALL(regs) && IN_ERROR(regs)) { + switch (SYSCALL_ERRNO(regs)) { + case ERESTARTSYS: + case ERESTARTNOINTR: + case ERESTARTNOHAND: + case ERESTART_RESTARTBLOCK: + case EAGAIN: + case EINTR: + ri->hooks |= (1<pn_state)) { + /* ... -> ptrace_notify() + * or + * ... -> do_signal() -> get_signal_to_deliver() -> + * ptrace stop + */ + tsk->last_siginfo = &ri->last_siginfo; + ri->hooks |= (1<last_siginfo, lsi); + } + + tsk->ptrace = ti->cpt_ptrace; + tsk->flags = (tsk->flags & PF_USED_MATH) | + (ti->cpt_flags & CPT_TASK_FLAGS_MASK); + clear_tsk_thread_flag(tsk, TIF_FREEZE); + tsk->exit_signal = ti->cpt_exit_signal; + + if (ri && tsk->stopped_state) { + dprintk_ctx("finish_stop\n"); + if (ti->cpt_state != TASK_STOPPED) + eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state); + ri->hooks |= (1<cpt_set_tid || ti->cpt_clear_tid)) { + ri->hooks |= (1<tid_ptrs[0] = ti->cpt_clear_tid; + ri->tid_ptrs[1] = ti->cpt_set_tid; + dprintk_ctx("settids\n"); + } + + if (ri && ri->hooks && + !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + if (try_module_get(THIS_MODULE)) + ri->hook = rst_resume_work; + } + + if (ti->cpt_state == TASK_TRACED) + tsk->state = TASK_TRACED; + else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) { + tsk->signal->it[CPUCLOCK_VIRT].expires = 0; + tsk->signal->it[CPUCLOCK_PROF].expires = 0; + if (tsk->state != TASK_DEAD) + eprintk_ctx("oops, schedule() did not make us dead\n"); + } + + if (thread_group_leader(tsk) && + 
ti->cpt_it_real_value && + !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + ktime_t val; + s64 nsec; + + nsec = ti->cpt_it_real_value; + val.tv64 = 0; + + if (ctx->image_version < CPT_VERSION_9) + nsec *= TICK_NSEC; + + val = ktime_add_ns(val, nsec - ctx->delta_nsec); + if (val.tv64 <= 0) + val.tv64 = NSEC_PER_USEC; + dprintk("rst itimer " CPT_FID " +%Ld %Lu\n", CPT_TID(tsk), + (long long)val.tv64, + (unsigned long long)ti->cpt_it_real_value); + + spin_lock_irq(&tsk->sighand->siglock); + if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) { + /* FIXME. Check!!!! */ + hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_MODE_REL); + } else { + wprintk_ctx("Timer clash. Impossible?\n"); + } + spin_unlock_irq(&tsk->sighand->siglock); + + dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk), + (unsigned long long)val.tv64); + } + + module_put(THIS_MODULE); + } + return 0; +} diff -urNp linux-2.6.32.48/kernel/cpt/rst_socket.c linux-2.6.32.48-openvz/kernel/cpt/rst_socket.c --- linux-2.6.32.48/kernel/cpt/rst_socket.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_socket.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,993 @@ +/* + * + * kernel/cpt/rst_socket.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +#include "cpt_syscalls.h" + + +static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + struct timeval tmptv; + + if (sk->sk_socket) { + sk->sk_socket->flags = si->cpt_ssflags; + sk->sk_socket->state = si->cpt_sstate; + } + sk->sk_reuse = si->cpt_reuse; + sk->sk_shutdown = si->cpt_shutdown; + sk->sk_userlocks = si->cpt_userlocks; + sk->sk_no_check = si->cpt_no_check; + sock_reset_flag(sk, SOCK_DBG); + if (si->cpt_debug) + sock_set_flag(sk, SOCK_DBG); + sock_reset_flag(sk, SOCK_RCVTSTAMP); + if (si->cpt_rcvtstamp) + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_LOCALROUTE); + if (si->cpt_localroute) + sock_set_flag(sk, SOCK_LOCALROUTE); + sk->sk_protocol = si->cpt_protocol; + sk->sk_err = si->cpt_err; + sk->sk_err_soft = si->cpt_err_soft; + sk->sk_priority = si->cpt_priority; + sk->sk_rcvlowat = si->cpt_rcvlowat; + sk->sk_rcvtimeo = si->cpt_rcvtimeo; + if (si->cpt_rcvtimeo == CPT_NULL) + sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_sndtimeo = si->cpt_sndtimeo; + if (si->cpt_sndtimeo == CPT_NULL) + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_rcvbuf = si->cpt_rcvbuf; + sk->sk_sndbuf = si->cpt_sndbuf; + sk->sk_bound_dev_if = si->cpt_bound_dev_if; + sk->sk_flags = si->cpt_flags; + sk->sk_lingertime = si->cpt_lingertime; + if (si->cpt_lingertime == CPT_NULL) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + sk->sk_peercred.pid = si->cpt_peer_pid; + sk->sk_peercred.uid = si->cpt_peer_uid; + sk->sk_peercred.gid = si->cpt_peer_gid; + cpt_timeval_import(&tmptv, si->cpt_stamp); + sk->sk_stamp = timeval_to_ktime(tmptv); + return 0; 
+} + +static struct file *sock_mapfile(struct socket *sock) +{ + int fd = sock_map_fd(sock, 0); + + if (fd >= 0) { + struct file *file = sock->file; + get_file(file); + sc_close(fd); + return file; + } + return ERR_PTR(fd); +} + +/* Assumption is that /tmp exists and writable. + * In previous versions we assumed that listen() will autobind + * the socket. It does not do this for AF_UNIX by evident reason: + * socket in abstract namespace is accessible, unlike socket bound + * to deleted FS object. + */ + +static int +select_deleted_name(char * name, cpt_context_t *ctx) +{ + int i; + + for (i=0; i<100; i++) { + struct nameidata nd; + unsigned int rnd = net_random(); + + sprintf(name, "/tmp/SOCK.%08x", rnd); + + if (path_lookup(name, 0, &nd) != 0) + return 0; + + path_put(&nd.path); + } + + eprintk_ctx("failed to allocate deleted socket inode\n"); + return -ELOOP; +} + +static int +bind_unix_socket(struct socket *sock, struct cpt_sock_image *si, + cpt_context_t *ctx) +{ + int err; + char *name; + struct sockaddr* addr; + int addrlen; + struct sockaddr_un sun; + struct nameidata nd; + + if ((addrlen = si->cpt_laddrlen) <= 2) + return 0; + + nd.path.dentry = NULL; + name = ((char*)si->cpt_laddr) + 2; + addr = (struct sockaddr *)si->cpt_laddr; + + if (name[0]) { + if (path_lookup(name, 0, &nd)) + nd.path.dentry = NULL; + + if (si->cpt_deleted) { + if (nd.path.dentry == NULL && + sock->ops->bind(sock, addr, addrlen) == 0) { + sc_unlink(name); + return 0; + } + + addr = (struct sockaddr*)&sun; + addr->sa_family = AF_UNIX; + name = ((char*)addr) + 2; + err = select_deleted_name(name, ctx); + if (err) + goto out; + addrlen = 2 + strlen(name); + } else if (nd.path.dentry) { + if (!S_ISSOCK(nd.path.dentry->d_inode->i_mode)) { + eprintk_ctx("bind_unix_socket: not a socket dentry\n"); + err = -EINVAL; + goto out; + } + sc_unlink(name); + } + } + + err = sock->ops->bind(sock, addr, addrlen); + + if (!err && name[0]) { + if (nd.path.dentry) { + sc_chown(name, 
nd.path.dentry->d_inode->i_uid, + nd.path.dentry->d_inode->i_gid); + sc_chmod(name, nd.path.dentry->d_inode->i_mode); + } + if (si->cpt_deleted) + sc_unlink(name); + } + +out: + if (nd.path.dentry) + path_put(&nd.path); + return err; +} + +static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + struct sock *sk = sock->sk; + cpt_object_t *obj; + struct sock *parent; + + if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN) + return 0; + + if (si->cpt_parent == -1) + return bind_unix_socket(sock, si, ctx); + + obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + if (!obj) + return 0; + + parent = obj->o_obj; + if (unix_sk(parent)->addr) { + if (unix_sk(sk)->addr && + atomic_dec_and_test(&unix_sk(sk)->addr->refcnt)) + kfree(unix_sk(sk)->addr); + atomic_inc(&unix_sk(parent)->addr->refcnt); + unix_sk(sk)->addr = unix_sk(parent)->addr; + } + return 0; +} + +static int generic_restore_queues(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + __u32 type; + + skb = rst_skb(sk, &pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + if (type == CPT_SKB_RQ) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else { + wprintk_ctx("strange socket queue type %u\n", type); + kfree_skb(skb); + } + } + return 0; +} + +static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct socket *sock2 = NULL; + struct file *file; + cpt_object_t *fobj; + cpt_object_t *pobj = NULL; + + err = sock_create(si->cpt_family, si->cpt_type, si->cpt_protocol, + &sock); + if (err) + return err; + + if (si->cpt_socketpair) { + 
err = sock_create(si->cpt_family, si->cpt_type, + si->cpt_protocol, &sock2); + if (err) + goto err_out; + + err = sock->ops->socketpair(sock, sock2); + if (err < 0) + goto err_out; + + /* Socketpair with a peer outside our environment. + * So, we create real half-open pipe and do not worry + * about dead end anymore. */ + if (si->cpt_peer == -1) { + sock_release(sock2); + sock2 = NULL; + } + } + + cpt_obj_setobj(obj, sock->sk, ctx); + + if (si->cpt_file != CPT_NULL) { + file = sock_mapfile(sock); + err = PTR_ERR(file); + if (IS_ERR(file)) + goto err_out; + + err = -ENOMEM; + + obj->o_parent = file; + + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(fobj, si->cpt_file, ctx); + cpt_obj_setindex(fobj, si->cpt_index, ctx); + } + + if (sock2) { + struct file *file2; + + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx); + if (!pobj) BUG(); + if (pobj->o_obj) BUG(); + cpt_obj_setobj(pobj, sock2->sk, ctx); + + if (pobj->o_ppos != CPT_NULL) { + file2 = sock_mapfile(sock2); + err = PTR_ERR(file2); + if (IS_ERR(file2)) + goto err_out; + + err = -ENOMEM; + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(fobj, pobj->o_ppos, ctx); + cpt_obj_setindex(fobj, si->cpt_peer, ctx); + + pobj->o_parent = file2; + } + } + + setup_sock_common(sock->sk, si, obj->o_pos, ctx); + if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) { + int saved_reuse = sock->sk->sk_reuse; + + inet_sk(sock->sk)->freebind = 1; + sock->sk->sk_reuse = 2; + if (si->cpt_laddrlen) { + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + if (err) { + dprintk_ctx("binding failed: %d, do not worry\n", err); + } + } + sock->sk->sk_reuse = saved_reuse; + rst_socket_in(si, obj->o_pos, sock->sk, ctx); + } else if (sock->sk->sk_family == AF_NETLINK) { + struct sockaddr_nl *nl = (struct sockaddr_nl *)&si->cpt_laddr; + if (nl->nl_pid) { + err = sock->ops->bind(sock, 
(struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + if (err) { + eprintk_ctx("AF_NETLINK binding failed: %d\n", err); + } + } + if (si->cpt_raddrlen && nl->nl_pid) { + err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK); + if (err) { + eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err); + } + } + generic_restore_queues(sock->sk, si, obj->o_pos, ctx); + } else if (sock->sk->sk_family == PF_PACKET) { + struct sockaddr_ll *ll = (struct sockaddr_ll *)&si->cpt_laddr; + if (ll->sll_protocol || ll->sll_ifindex) { + int alen = si->cpt_laddrlen; + if (alen < sizeof(struct sockaddr_ll)) + alen = sizeof(struct sockaddr_ll); + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, alen); + if (err) { + eprintk_ctx("AF_PACKET binding failed: %d\n", err); + } + } + generic_restore_queues(sock->sk, si, obj->o_pos, ctx); + } + fixup_unix_address(sock, si, ctx); + + if (sock2) { + err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx); + if (err) + return err; + setup_sock_common(sock2->sk, si, pobj->o_pos, ctx); + fixup_unix_address(sock2, si, ctx); + } + + if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) + && (int)si->cpt_parent != -1) { + cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0) + sock->sk = NULL; + } + + + if (si->cpt_file == CPT_NULL && sock->sk && + sock->sk->sk_family == AF_INET) { + struct sock *sk = sock->sk; + + if (sk) { + sock->sk = NULL; + + local_bh_disable(); + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + eprintk_ctx("oops, sock is locked by user\n"); + + sock_hold(sk); + sock_orphan(sk); + ub_inc_orphan_count(sk); + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); + dprintk_ctx("orphaning socket %p\n", sk); + } + } + + if (si->cpt_file == CPT_NULL && sock->sk == NULL) + sock_release(sock); + + return 0; + +err_out: + if (sock2) + sock_release(sock2); + 
sock_release(sock); + return err; +} + +static int open_listening_socket(loff_t pos, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct file *file; + cpt_object_t *obj, *fobj; + + err = sock_create(si->cpt_family, si->cpt_type, si->cpt_protocol, + &sock); + if (err) { + eprintk_ctx("open_listening_socket: sock_create: %d\n", err); + return err; + } + + sock->sk->sk_reuse = 2; + sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if; + + if (sock->sk->sk_family == AF_UNIX) { + err = bind_unix_socket(sock, si, ctx); + } else if (si->cpt_laddrlen) { + if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) + inet_sk(sock->sk)->freebind = 1; + + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + + if (err) { + eprintk_ctx("open_listening_socket: bind: %d\n", err); + goto err_out; + } + } + + err = sock->ops->listen(sock, si->cpt_max_ack_backlog); + if (err) { + eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted); + goto err_out; + } + + /* Now we may access socket body directly and fixup all the things. 
*/ + + file = sock_mapfile(sock); + err = PTR_ERR(file); + if (IS_ERR(file)) { + eprintk_ctx("open_listening_socket: map: %d\n", err); + goto err_out; + } + + err = -ENOMEM; + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) + goto err_out; + if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(obj, pos, ctx); + cpt_obj_setindex(obj, si->cpt_index, ctx); + obj->o_parent = file; + cpt_obj_setpos(fobj, si->cpt_file, ctx); + cpt_obj_setindex(fobj, si->cpt_index, ctx); + + setup_sock_common(sock->sk, si, pos, ctx); + + if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) { + rst_listen_socket_in(sock->sk, si, pos, ctx); + rst_restore_synwait_queue(sock->sk, si, pos, ctx); + } + + return 0; + +err_out: + sock_release(sock); + return err; +} + +static int +rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + loff_t pos = *pos_p; + struct cpt_sockmc_image v; + + err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx); + if (err) + return err; + + *pos_p += v.cpt_next; + + if (v.cpt_family == AF_INET) + return rst_sk_mcfilter_in(sk, &v, pos, ctx); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (v.cpt_family == AF_INET6) + return rst_sk_mcfilter_in6(sk, &v, pos, ctx); +#endif + else + return -EAFNOSUPPORT; +} + + +static int +rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + struct sk_filter *fp, *old_fp; + loff_t pos = *pos_p; + struct cpt_obj_bits v; + + err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx); + if (err) + return err; + + *pos_p += v.cpt_next; + + if (v.cpt_size % sizeof(struct sock_filter)) + return -EINVAL; + + fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC); + if (fp == NULL) + return -ENOMEM; + atomic_set(&fp->refcnt, 1); + fp->len = v.cpt_size/sizeof(struct sock_filter); + + err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen); + if (err) { + sk_filter_uncharge(sk, 
fp); + return err; + } + + old_fp = sk->sk_filter; + sk->sk_filter = fp; + if (old_fp) + sk_filter_uncharge(sk, old_fp); + return 0; +} + + +int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + loff_t pos = *pos_p; + + err = rst_sock_attr_skfilter(pos_p, sk, ctx); + if (err && pos == *pos_p) + err = rst_sock_attr_mcfilter(pos_p, sk, ctx); + return err; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void rst_tcp_cb_ipv4_to_ipv6(struct cpt_skb_image *v, struct sk_buff *skb) +{ + BUG_ON(sizeof(skb->cb) - sizeof(struct inet6_skb_parm) < + sizeof(struct tcp_skb_cb) - sizeof(struct inet6_skb_parm)); + memcpy(skb->cb, v->cpt_cb, sizeof(struct inet_skb_parm)); + memcpy(skb->cb + sizeof(struct inet6_skb_parm), + (void *)v->cpt_cb + sizeof(struct inet_skb_parm), + sizeof(struct tcp_skb_cb) - sizeof(struct inet6_skb_parm)); +} +#else +static void rst_tcp_cb_ipv6_to_ipv4(struct cpt_skb_image *v, struct sk_buff *skb) +{ + BUG_ON(sizeof(v->cpt_cb) - sizeof(struct inet6_skb_parm) < + sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm)); + memcpy(skb->cb, v->cpt_cb, sizeof(struct inet_skb_parm)); + memcpy(skb->cb + sizeof(struct inet_skb_parm), + (void *)v->cpt_cb + sizeof(struct inet6_skb_parm), + sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm)); +} +#endif + +struct tcp_skb_cb_ipv6 { + union { + struct inet_skb_parm h4; + struct inet6_skb_parm h6; + } header; + __u32 seq; + __u32 end_seq; + __u32 when; + __u8 flags; + __u8 sacked; + __u16 urg_ptr; + __u32 ack_seq; +}; + +#define check_tcp_cb_conv(op1, op2) do { \ + if (!ctx->tcp_cb_convert) \ + ctx->tcp_cb_convert = CPT_TCP_CB_##op1; \ + else if (ctx->tcp_cb_convert == CPT_TCP_CB_##op2) { \ + kfree_skb(skb); \ + return ERR_PTR(-EINVAL); \ + } \ +} while (0) + +struct sk_buff * rst_skb(struct sock *sk, loff_t *pos_p, __u32 *owner, + __u32 *queue, struct cpt_context *ctx) +{ + int err; + struct sk_buff *skb; + struct cpt_skb_image v; + loff_t pos = *pos_p; + 
struct scm_fp_list *fpl = NULL; + struct timeval tmptv; + + err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx); + if (err) + return ERR_PTR(err); + *pos_p = pos + v.cpt_next; + + if (owner) + *owner = v.cpt_owner; + if (queue) + *queue = v.cpt_queue; + + skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL); + if (skb == NULL) + return ERR_PTR(-ENOMEM); + skb_reserve(skb, v.cpt_hspace); + skb_put(skb, v.cpt_len); +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->transport_header = v.cpt_h; + skb->network_header = v.cpt_nh; + skb->mac_header = v.cpt_mac; +#else + skb->transport_header = skb->head + v.cpt_h; + skb->network_header = skb->head + v.cpt_nh; + skb->mac_header = skb->head + v.cpt_mac; +#endif + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v.cpt_cb)); + if (sk->sk_protocol == IPPROTO_TCP) { + /* + * According to Alexey all packets in queue have non-zero + * flags, as at least TCPCB_FLAG_ACK is set on them. + * Luckily for us, offset of field flags in tcp_skb_cb struct + * with IPv6 is higher then total size of tcp_skb_cb struct + * without IPv6. 
+ */ + if (ctx->image_version >= CPT_VERSION_18_2 || + ((struct tcp_skb_cb_ipv6 *)&v.cpt_cb)->flags) { +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + check_tcp_cb_conv(NOT_CONV, CONV); + memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); +#else + check_tcp_cb_conv(CONV, NOT_CONV); + rst_tcp_cb_ipv6_to_ipv4(&v, skb); +#endif + } else { +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + check_tcp_cb_conv(CONV, NOT_CONV); + rst_tcp_cb_ipv4_to_ipv6(&v, skb); +#else + check_tcp_cb_conv(NOT_CONV, CONV); + memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); +#endif + } + } else + memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); + skb->mac_len = v.cpt_mac_len; + + skb->csum = v.cpt_csum; + skb->local_df = v.cpt_local_df; + skb->pkt_type = v.cpt_pkt_type; + skb->ip_summed = v.cpt_ip_summed; + skb->priority = v.cpt_priority; + skb->protocol = v.cpt_protocol; + cpt_timeval_import(&tmptv, v.cpt_stamp); + skb->tstamp = timeval_to_ktime(tmptv); + + skb_shinfo(skb)->gso_segs = v.cpt_gso_segs; + skb_shinfo(skb)->gso_size = v.cpt_gso_size; + if (ctx->image_version == 0) { + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + } + + if (v.cpt_next > v.cpt_hdrlen) { + pos = pos + v.cpt_hdrlen; + while (pos < *pos_p) { + union { + struct cpt_obj_bits b; + struct cpt_fd_image f; + } u; + + err = rst_get_object(-1, pos, &u, ctx); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + if (u.b.cpt_object == CPT_OBJ_BITS) { + if (u.b.cpt_size != v.cpt_hspace + skb->len) { + eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len); + kfree_skb(skb); + return ERR_PTR(-EINVAL); + } + + err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + } else if (u.f.cpt_object == CPT_OBJ_FILEDESC) { + if (!fpl) { + fpl = kmalloc(sizeof(struct scm_fp_list), + GFP_KERNEL_UBC); + if (!fpl) { + kfree_skb(skb); + return ERR_PTR(-ENOMEM); + } + fpl->count = 0; + UNIXCB(skb).fp = fpl; + } 
+ fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx); + if (!IS_ERR(fpl->fp[fpl->count])) + fpl->count++; + } + pos += u.b.cpt_next; + } + } + + return skb; +} + +static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + struct sock *owner_sk; + __u32 owner; + + skb = rst_skb(sk, &pos, &owner, NULL, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + owner_sk = unix_peer(sk); + if (owner != -1) { + cpt_object_t *pobj; + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx); + if (pobj == NULL) { + eprintk_ctx("orphan af_unix skb?\n"); + kfree_skb(skb); + continue; + } + owner_sk = pobj->o_obj; + } + if (owner_sk == NULL) { + dprintk_ctx("orphan af_unix skb 2?\n"); + kfree_skb(skb); + continue; + } + skb_set_owner_w(skb, owner_sk); + if (UNIXCB(skb).fp) + skb->destructor = unix_destruct_fds; + skb_queue_tail(&sk->sk_receive_queue, skb); + if (sk->sk_state == TCP_LISTEN) { + struct socket *sock = skb->sk->sk_socket; + if (sock == NULL) BUG(); + if (sock->file) BUG(); + skb->sk->sk_socket = NULL; + skb->sk->sk_sleep = NULL; + sock->sk = NULL; + sock_release(sock); + } + } + return 0; +} + + +/* All the sockets are created before we start to open files */ + +int rst_sockets(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SOCKET]; + loff_t endsec; + cpt_object_t *obj; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) { + eprintk_ctx("rst_sockets: ctx->pread: %d\n", err); + return err; + } + if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) { + eprintk_ctx("rst_sockets: hdr err\n"); + return -EINVAL; + } + + /* The first pass: we create socket 
index and open listening sockets. */ + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_sock_image *sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); + if (err) { + eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); + cpt_release_buf(ctx); + return err; + } + if (sbuf->cpt_state == TCP_LISTEN) { + err = open_listening_socket(sec, sbuf, ctx); + cpt_release_buf(ctx); + if (err) { + eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err); + return err; + } + } else { + cpt_release_buf(ctx); + obj = alloc_cpt_object(GFP_KERNEL, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setindex(obj, sbuf->cpt_index, ctx); + cpt_obj_setpos(obj, sec, ctx); + obj->o_ppos = sbuf->cpt_file; + intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx); + } + sec += sbuf->cpt_next; + } + + /* Pass 2: really restore sockets */ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct cpt_sock_image *sbuf; + if (obj->o_obj != NULL) + continue; + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); + cpt_release_buf(ctx); + return err; + } + if (sbuf->cpt_state == TCP_LISTEN) BUG(); + err = open_socket(obj, sbuf, ctx); + cpt_release_buf(ctx); + if (err) { + eprintk_ctx("rst_sockets: open_socket: %d\n", err); + return err; + } + } + + return 0; +} + +int rst_orphans(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_ORPHANS]; + loff_t endsec; + cpt_object_t *obj; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_sock_image *sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + 
return err; + } + obj = alloc_cpt_object(GFP_KERNEL, ctx); + if (obj == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + obj->o_pos = sec; + obj->o_ppos = sbuf->cpt_file; + err = open_socket(obj, sbuf, ctx); + dprintk_ctx("Restoring orphan: %d\n", err); + free_cpt_object(obj, ctx); + cpt_release_buf(ctx); + if (err) + return err; + sec += sbuf->cpt_next; + } + + return 0; +} + + +/* Pass 3: I understand, this is not funny already :-), + * but we have to do another pass to establish links between + * not-paired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX + * skb queues with proper skb->sk links. + * + * This could be made at the end of rst_sockets(), but we defer + * restoring af_unix queues up to the end of restoring files to + * make restoring passed FDs cleaner. + */ + +int rst_sockets_complete(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_SOCKET) { + struct cpt_sock_image *sbuf; + struct sock *sk = obj->o_obj; + struct sock *peer; + + if (!sk) BUG(); + + if (sk->sk_family != AF_UNIX) + continue; + + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (sbuf->cpt_next > sbuf->cpt_hdrlen) + restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx); + + cpt_release_buf(ctx); + + if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) { + cpt_object_t *pobj; + + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (sbuf->cpt_peer != -1) { + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx); + if (pobj) { + peer = pobj->o_obj; + sock_hold(peer); + unix_peer(sk) = peer; + } + } + cpt_release_buf(ctx); + } + } + + rst_orphans(ctx); + + return 0; +} + diff -urNp linux-2.6.32.48/kernel/cpt/rst_socket_in.c linux-2.6.32.48-openvz/kernel/cpt/rst_socket_in.c --- linux-2.6.32.48/kernel/cpt/rst_socket_in.c 1969-12-31 
19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_socket_in.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,578 @@ +/* + * + * kernel/cpt/rst_socket_in.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +static inline unsigned long jiffies_import(__u32 tmo) +{ + __s32 delta = tmo; + return jiffies + (long)delta; +} + +static inline __u32 tcp_jiffies_import(__u32 tmo) +{ + return ((__u32)jiffies) + tmo; +} + + +static int restore_queues(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + __u32 type; + + skb = rst_skb(sk, &pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + if (sk->sk_type == SOCK_STREAM) { + if (type == CPT_SKB_RQ) { + skb_set_owner_r(skb, sk); + ub_tcprcvbuf_charge_forced(sk, skb); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else if (type == CPT_SKB_OFOQ) { + struct tcp_sock *tp = tcp_sk(sk); + skb_set_owner_r(skb, sk); + ub_tcprcvbuf_charge_forced(sk, skb); + skb_queue_tail(&tp->out_of_order_queue, skb); + } else if (type == CPT_SKB_WQ) { + sk->sk_wmem_queued += skb->truesize; + sk->sk_forward_alloc -= skb->truesize; + ub_tcpsndbuf_charge_forced(sk, skb); + skb_queue_tail(&sk->sk_write_queue, skb); + } else { + wprintk_ctx("strange stream queue type %u\n", type); + kfree_skb(skb); + } + } else { + if (type == CPT_SKB_RQ) { + 
skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else if (type == CPT_SKB_WQ) { + struct inet_sock *inet = inet_sk(sk); + if (inet->cork.fragsize) { + skb_set_owner_w(skb, sk); + skb_queue_tail(&sk->sk_write_queue, skb); + } else { + eprintk_ctx("cork skb is dropped\n"); + kfree_skb(skb); + } + } else { + wprintk_ctx("strange dgram queue type %u\n", type); + kfree_skb(skb); + } + } + } + return 0; +} + +static struct sock *find_parent(__u16 sport, cpt_context_t *ctx) +{ + cpt_object_t *obj; + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && + sk->sk_state == TCP_LISTEN && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && + inet_sk(sk)->sport == sport) + return sk; + } + return NULL; +} + +static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk, + struct cpt_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + tp->pred_flags = si->cpt_pred_flags; + tp->rcv_nxt = si->cpt_rcv_nxt; + tp->snd_nxt = si->cpt_snd_nxt; + tp->snd_una = si->cpt_snd_una; + tp->snd_sml = si->cpt_snd_sml; + tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp); + tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime); + tp->tcp_header_len = si->cpt_tcp_header_len; + inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending; + inet_csk(sk)->icsk_ack.quick = si->cpt_quick; + inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong; + inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked; + inet_csk(sk)->icsk_ack.ato = si->cpt_ato; + inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout); + inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime); + inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size; + inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss; + tp->snd_wl1 = si->cpt_snd_wl1; + tp->snd_wnd = si->cpt_snd_wnd; + tp->max_window = si->cpt_max_window; + inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie; + tp->mss_cache = si->cpt_mss_cache; + 
tp->rx_opt.mss_clamp = si->cpt_mss_clamp; + inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len; + inet_csk(sk)->icsk_ca_state = si->cpt_ca_state; + inet_csk(sk)->icsk_retransmits = si->cpt_retransmits; + tp->reordering = si->cpt_reordering; + tp->frto_counter = si->cpt_frto_counter; + tp->frto_highmark = si->cpt_frto_highmark; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) + // // tp->adv_cong = si->cpt_adv_cong; +#endif + inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept; + inet_csk(sk)->icsk_backoff = si->cpt_backoff; + tp->srtt = si->cpt_srtt; + tp->mdev = si->cpt_mdev; + tp->mdev_max = si->cpt_mdev_max; + tp->rttvar = si->cpt_rttvar; + tp->rtt_seq = si->cpt_rtt_seq; + inet_csk(sk)->icsk_rto = si->cpt_rto; + tp->packets_out = si->cpt_packets_out; + tp->retrans_out = si->cpt_retrans_out; + tp->lost_out = si->cpt_lost_out; + tp->sacked_out = si->cpt_sacked_out; + tp->fackets_out = si->cpt_fackets_out; + tp->snd_ssthresh = si->cpt_snd_ssthresh; + tp->snd_cwnd = si->cpt_snd_cwnd; + tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt; + tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp; + tp->snd_cwnd_used = si->cpt_snd_cwnd_used; + tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp); + inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout); + tp->rcv_wnd = si->cpt_rcv_wnd; + tp->rcv_wup = si->cpt_rcv_wup; + tp->write_seq = si->cpt_write_seq; + tp->pushed_seq = si->cpt_pushed_seq; + tp->copied_seq = si->cpt_copied_seq; + tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok; + tp->rx_opt.wscale_ok = si->cpt_wscale_ok; + tp->rx_opt.sack_ok = si->cpt_sack_ok; + tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp; + tp->rx_opt.snd_wscale = si->cpt_snd_wscale; + tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale; + tp->nonagle = si->cpt_nonagle; + tp->keepalive_probes = si->cpt_keepalive_probes; + tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval; + tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr; + tp->rx_opt.ts_recent = si->cpt_ts_recent; + tp->rx_opt.ts_recent_stamp = 
si->cpt_ts_recent_stamp; + tp->rx_opt.user_mss = si->cpt_user_mss; + tp->rx_opt.dsack = si->cpt_dsack; + tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0]; + tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1]; + tp->selective_acks[0].start_seq = si->cpt_sack_array[2]; + tp->selective_acks[0].end_seq = si->cpt_sack_array[3]; + tp->selective_acks[1].start_seq = si->cpt_sack_array[4]; + tp->selective_acks[1].end_seq = si->cpt_sack_array[5]; + tp->selective_acks[2].start_seq = si->cpt_sack_array[6]; + tp->selective_acks[2].end_seq = si->cpt_sack_array[7]; + tp->selective_acks[3].start_seq = si->cpt_sack_array[8]; + tp->selective_acks[3].end_seq = si->cpt_sack_array[9]; + + tp->window_clamp = si->cpt_window_clamp; + tp->rcv_ssthresh = si->cpt_rcv_ssthresh; + inet_csk(sk)->icsk_probes_out = si->cpt_probes_out; + tp->rx_opt.num_sacks = si->cpt_num_sacks; + tp->advmss = si->cpt_advmss; + inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries; + tp->ecn_flags = si->cpt_ecn_flags; + tp->prior_ssthresh = si->cpt_prior_ssthresh; + tp->high_seq = si->cpt_high_seq; + tp->retrans_stamp = si->cpt_retrans_stamp; + tp->undo_marker = si->cpt_undo_marker; + tp->undo_retrans = si->cpt_undo_retrans; + tp->urg_seq = si->cpt_urg_seq; + tp->urg_data = si->cpt_urg_data; + inet_csk(sk)->icsk_pending = si->cpt_pending; + tp->snd_up = si->cpt_snd_up; + tp->keepalive_time = si->cpt_keepalive_time; + tp->keepalive_intvl = si->cpt_keepalive_intvl; + tp->linger2 = si->cpt_linger2; + + sk->sk_send_head = NULL; + for (skb = skb_peek(&sk->sk_write_queue); + skb && skb != (struct sk_buff*)&sk->sk_write_queue; + skb = skb->next) { + if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) { + sk->sk_send_head = skb; + break; + } + } + + if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) { + struct inet_sock *inet = inet_sk(sk); + if (inet->num == 0) { + cpt_object_t *lobj = NULL; + + if ((int)si->cpt_parent != -1) + lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + + if (lobj 
&& lobj->o_obj) { + inet->num = ntohs(inet->sport); + local_bh_disable(); + __inet_inherit_port(lobj->o_obj, sk); + local_bh_enable(); + dprintk_ctx("port inherited from parent\n"); + } else { + struct sock *lsk = find_parent(inet->sport, ctx); + if (lsk) { + inet->num = ntohs(inet->sport); + local_bh_disable(); + __inet_inherit_port(lsk, sk); + local_bh_enable(); + dprintk_ctx("port inherited\n"); + } else { + eprintk_ctx("we are kinda lost...\n"); + } + } + } + + sk->sk_prot->hash(sk); + + if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER) + sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout); + if (inet_csk(sk)->icsk_pending) + sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer, + inet_csk(sk)->icsk_timeout); + if (sock_flag(sk, SOCK_KEEPOPEN)) { + unsigned long expires = jiffies_import(si->cpt_ka_timeout); + if (time_after(jiffies, expires)) + expires = jiffies + HZ; + sk_reset_timer(sk, &sk->sk_timer, expires); + } + } + + if (sk->sk_family == AF_INET6) + sk->sk_gso_type = SKB_GSO_TCPV6; + else + sk->sk_gso_type = SKB_GSO_TCPV4; + + return 0; +} + +static void rst_listen_socket_tcp(struct cpt_sock_image *si, struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp); + tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime); + tp->tcp_header_len = si->cpt_tcp_header_len; + inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept; + + /* Next options are inherited by children */ + tp->mss_cache = si->cpt_mss_cache; + inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len; + tp->reordering = si->cpt_reordering; + tp->nonagle = si->cpt_nonagle; + tp->keepalive_probes = si->cpt_keepalive_probes; + tp->rx_opt.user_mss = si->cpt_user_mss; + inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries; + tp->keepalive_time = si->cpt_keepalive_time; + tp->keepalive_intvl = si->cpt_keepalive_intvl; + tp->linger2 = si->cpt_linger2; +} + +int rst_listen_socket_in( struct 
sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + + lock_sock(sk); + + inet->uc_ttl = si->cpt_uc_ttl; + inet->tos = si->cpt_tos; + inet->cmsg_flags = si->cpt_cmsg_flags; + inet->pmtudisc = si->cpt_pmtudisc; + inet->recverr = si->cpt_recverr; + inet->freebind = si->cpt_freebind; + inet->id = si->cpt_idcounter; + + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + np->frag_size = si->cpt_frag_size6; + np->hop_limit = si->cpt_hop_limit6; + + np->rxopt.all = si->cpt_rxopt6; + np->mc_loop = si->cpt_mc_loop6; + np->recverr = si->cpt_recverr6; + np->pmtudisc = si->cpt_pmtudisc6; + np->ipv6only = si->cpt_ipv6only6; + } + + if (sk->sk_protocol == IPPROTO_TCP) + rst_listen_socket_tcp(si, sk); + + release_sock(sk); + return 0; +} + +int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk, + struct cpt_context *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + struct net *net = get_exec_env()->ve_ns->net_ns; + + lock_sock(sk); + + sk->sk_state = si->cpt_state; + + inet->daddr = si->cpt_daddr; + inet->dport = si->cpt_dport; + inet->saddr = si->cpt_saddr; + inet->rcv_saddr = si->cpt_rcv_saddr; + inet->sport = si->cpt_sport; + inet->uc_ttl = si->cpt_uc_ttl; + inet->tos = si->cpt_tos; + inet->cmsg_flags = si->cpt_cmsg_flags; + inet->mc_index = si->cpt_mc_index; + inet->mc_addr = si->cpt_mc_addr; + inet->hdrincl = si->cpt_hdrincl; + inet->mc_ttl = si->cpt_mc_ttl; + inet->mc_loop = si->cpt_mc_loop; + inet->pmtudisc = si->cpt_pmtudisc; + inet->recverr = si->cpt_recverr; + inet->freebind = si->cpt_freebind; + inet->id = si->cpt_idcounter; + + inet->cork.flags = si->cpt_cork_flags; + inet->cork.fragsize = si->cpt_cork_fragsize; + inet->cork.length = si->cpt_cork_length; + inet->cork.addr = si->cpt_cork_addr; + inet->cork.fl.fl4_src = si->cpt_cork_saddr; + inet->cork.fl.fl4_dst = si->cpt_cork_daddr; + inet->cork.fl.oif = si->cpt_cork_oif; + if (inet->cork.fragsize) { + if 
(ip_route_output_key(net, (struct rtable **)&inet->cork.dst, &inet->cork.fl)) { + eprintk_ctx("failed to restore cork route\n"); + inet->cork.fragsize = 0; + } + } + + if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { + struct udp_sock *up = udp_sk(sk); + up->pending = si->cpt_udp_pending; + up->corkflag = si->cpt_udp_corkflag; + up->encap_type = si->cpt_udp_encap; + up->len = si->cpt_udp_len; + } + + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + memcpy(&np->saddr, si->cpt_saddr6, 16); + memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16); + memcpy(&np->daddr, si->cpt_daddr6, 16); + np->flow_label = si->cpt_flow_label6; + np->frag_size = si->cpt_frag_size6; + np->hop_limit = si->cpt_hop_limit6; + np->mcast_hops = si->cpt_mcast_hops6; + np->mcast_oif = si->cpt_mcast_oif6; + np->rxopt.all = si->cpt_rxopt6; + np->mc_loop = si->cpt_mc_loop6; + np->recverr = si->cpt_recverr6; + np->sndflow = si->cpt_sndflow6; + np->pmtudisc = si->cpt_pmtudisc6; + np->ipv6only = si->cpt_ipv6only6; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (si->cpt_mapped) { + extern struct inet_connection_sock_af_ops ipv6_mapped; + if (sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) { + inet_csk(sk)->icsk_af_ops = &ipv6_mapped; + sk->sk_backlog_rcv = tcp_v4_do_rcv; + } + } +#endif + } + + restore_queues(sk, si, pos, ctx); + + if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) + rst_socket_tcp(si, pos, sk, ctx); + + release_sock(sk); + return 0; +} + +int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx) +{ + struct request_sock *req; + + if (lsk->sk_state != TCP_LISTEN) + return -EINVAL; + + req = reqsk_alloc(&tcp_request_sock_ops); + if (!req) + return -ENOMEM; + + sk->sk_socket = NULL; + sk->sk_sleep = NULL; + inet_csk_reqsk_queue_add(lsk, req, sk); + return 0; +} + +int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + 
int err; + loff_t end = pos + si->cpt_next; + + pos += si->cpt_hdrlen; + + lock_sock(sk); + while (pos < end) { + struct cpt_openreq_image oi; + + err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx); + if (err) { + err = rst_sock_attr(&pos, sk, ctx); + if (err) { + release_sock(sk); + return err; + } + + continue; + } + + if (oi.cpt_object == CPT_OBJ_OPENREQ) { + struct request_sock *req; + + if (oi.cpt_family == AF_INET6 && + sk->sk_family != AF_INET6) + /* related to non initialized cpt_family bug */ + goto next; + + if (oi.cpt_family == AF_INET6) { +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + req = reqsk_alloc(&tcp6_request_sock_ops); +#else + release_sock(sk); + return -EINVAL; +#endif + } else { + req = reqsk_alloc(&tcp_request_sock_ops); + } + + if (req == NULL) { + release_sock(sk); + return -ENOMEM; + } + + tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn; + tcp_rsk(req)->snt_isn = oi.cpt_snt_isn; + inet_rsk(req)->rmt_port = oi.cpt_rmt_port; + req->mss = oi.cpt_mss; + req->retrans = oi.cpt_retrans; + inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale; + inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale; + inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok; + inet_rsk(req)->sack_ok = oi.cpt_sack_ok; + inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok; + inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok; + inet_rsk(req)->acked = oi.cpt_acked; + inet_rsk(req)->opt = NULL; + req->window_clamp = oi.cpt_window_clamp; + req->rcv_wnd = oi.cpt_rcv_wnd; + req->ts_recent = oi.cpt_ts_recent; + req->expires = jiffies_import(oi.cpt_expires); + req->sk = NULL; + req->secid = 0; + req->peer_secid = 0; + + if (oi.cpt_family == AF_INET6) { +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + inet6_rsk(req)->pktopts = NULL; + memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16); + memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16); + inet6_rsk(req)->iif = oi.cpt_iif; + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); +#endif + } else { + memcpy(&inet_rsk(req)->loc_addr, 
oi.cpt_loc_addr, 4); + memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } + } +next: + pos += oi.cpt_next; + } + release_sock(sk); + return 0; +} + +int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx) +{ + struct ip_mreqn imr; + + if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { + eprintk_ctx("IGMPv3 is still not supported\n"); + return -EINVAL; + } + + memset(&imr, 0, sizeof(imr)); + imr.imr_ifindex = v->cpt_ifindex; + imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0]; + return ip_mc_join_group(sk, &imr); +} + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx) +{ + + if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { + eprintk_ctx("IGMPv3 is still not supported\n"); + return -EINVAL; + } + + return ipv6_sock_mc_join(sk, v->cpt_ifindex, + (struct in6_addr*)v->cpt_mcaddr); +} +#endif diff -urNp linux-2.6.32.48/kernel/cpt/rst_sysvipc.c linux-2.6.32.48-openvz/kernel/cpt/rst_sysvipc.c --- linux-2.6.32.48/kernel/cpt/rst_sysvipc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_sysvipc.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,639 @@ +/* + * + * kernel/cpt/rst_sysvipc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" + +struct _warg { + struct file *file; + struct cpt_sysvshm_image *v; +}; + +static int fixup_one_shm(struct shmid_kernel *shp, void *arg) +{ + struct _warg *warg = arg; + + if (shp->shm_file != warg->file) + return 0; + if (shp->shm_nattch) + return -EEXIST; + + shp->shm_perm.uid = warg->v->cpt_uid; + shp->shm_perm.gid = warg->v->cpt_gid; + shp->shm_perm.cuid = warg->v->cpt_cuid; + shp->shm_perm.cgid = warg->v->cpt_cgid; + shp->shm_perm.mode = warg->v->cpt_mode; + + shp->shm_atim = warg->v->cpt_atime; + shp->shm_dtim = warg->v->cpt_dtime; + shp->shm_ctim = warg->v->cpt_ctime; + shp->shm_cprid = warg->v->cpt_creator; + shp->shm_lprid = warg->v->cpt_last; + + /* TODO: fix shp->mlock_user? */ + return 1; +} + +static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v) +{ + struct _warg warg; + + warg.file = file; + warg.v = v; + + return sysvipc_walk_shm(fixup_one_shm, &warg); +} + +static int fixup_shm_data(struct file *file, loff_t pos, loff_t end, + struct cpt_context *ctx) +{ + struct cpt_page_block pgb; + ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); + + do_write = file->f_dentry->d_inode->i_fop->write; + if (do_write == NULL) { + eprintk_ctx("No TMPFS? 
Cannot restore content of SYSV SHM\n"); + return -EINVAL; + } + + while (pos < end) { + loff_t opos; + loff_t ipos; + int count; + int err; + + err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); + if (err) + return err; + dprintk_ctx("restoring SHM block: %08x-%08x\n", + (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); + ipos = pos + pgb.cpt_hdrlen; + opos = pgb.cpt_start; + count = pgb.cpt_end-pgb.cpt_start; + while (count > 0) { + mm_segment_t oldfs; + int copy = count; + + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + oldfs = get_fs(); set_fs(KERNEL_DS); + err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); + set_fs(oldfs); + if (err) { + __cpt_release_buf(ctx); + return err; + } + oldfs = get_fs(); set_fs(KERNEL_DS); + ipos += copy; + err = do_write(file, ctx->tmpbuf, copy, &opos); + set_fs(oldfs); + __cpt_release_buf(ctx); + if (err != copy) { + eprintk_ctx("write() failure\n"); + if (err >= 0) + err = -EIO; + return err; + } + count -= copy; + } + pos += pgb.cpt_next; + } + return 0; +} + +struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx) +{ + struct file *file; + int err; + loff_t dpos, epos; + union { + struct cpt_file_image fi; + struct cpt_sysvshm_image shmi; + struct cpt_inode_image ii; + } u; + + err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx); + if (err < 0) + goto err_out; + pos = u.fi.cpt_inode; + err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx); + if (err < 0) + goto err_out; + dpos = pos + u.ii.cpt_hdrlen; + epos = pos + u.ii.cpt_next; + err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx); + if (err < 0) + goto err_out; + dpos += u.shmi.cpt_next; + + file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id, + u.shmi.cpt_segsz, u.shmi.cpt_mode); + if (!IS_ERR(file)) { + err = fixup_shm(file, &u.shmi); + if (err != -EEXIST && dpos < epos) { + err = fixup_shm_data(file, dpos, epos, ctx); + if (err) + goto err_put; + } + } else if (IS_ERR(file) && PTR_ERR(file) == -EEXIST) { + struct 
ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+		struct shmid_kernel *shp;
+
+		shp = shm_lock(ipc_ns, u.shmi.cpt_id);
+		BUG_ON(IS_ERR(shp));
+		get_file(shp->shm_file);
+		file = shp->shm_file;
+		shm_unlock(shp);
+	}
+	return file;
+
+err_put:
+	fput(file);
+err_out:
+	return ERR_PTR(err);
+}
+
+/*
+ * Build the file that backs a restored SysV SHM mapping described by @vmai.
+ *
+ * The segment itself is first restored through rst_sysv_shm_itself(); the
+ * file it returns is only needed to make sure the segment exists and is
+ * released again.  The image is then re-read to obtain the segment id, the
+ * segment is looked up, and a new file using shm_file_operations is
+ * allocated on the segment's dentry/mount with a freshly allocated
+ * shm_file_data attached.  The open mode is derived from the VM_READ,
+ * VM_WRITE and VM_EXEC bits of vmai->cpt_flags.
+ *
+ * Returns the new file, or an ERR_PTR() on failure.
+ */
+struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx)
+{
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	struct file *file;
+	union {
+		struct cpt_file_image fi;
+		struct cpt_inode_image ii;
+		struct cpt_sysvshm_image shmi;
+	} u;
+	struct shmid_kernel *shp;
+	struct shm_file_data *sfd;
+	struct path path;
+	mode_t f_mode;
+	loff_t pos;
+	int err;
+
+	pos = vmai->cpt_file;
+	file = rst_sysv_shm_itself(pos, ctx);
+	if (IS_ERR(file)) {
+		/* -EEXIST is tolerated: the segment is looked up by id below. */
+		if (PTR_ERR(file) != -EEXIST)
+			return file;
+	} else {
+		/* Only a real file may be released; fput() on an ERR_PTR would oops. */
+		fput(file);
+	}
+
+	err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx);
+	if (err < 0)
+		goto err_out;
+	pos = u.fi.cpt_inode;
+	err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx);
+	if (err < 0)
+		goto err_out;
+	err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx);
+	if (err < 0)
+		goto err_out;
+
+	shp = shm_lock(ipc_ns, u.shmi.cpt_id);
+	BUG_ON(IS_ERR(shp));
+	path.dentry = dget(shp->shm_file->f_path.dentry);
+	path.mnt = shp->shm_file->f_path.mnt;
+	shm_unlock(shp);
+
+	err = -ENOMEM;
+	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
+	if (!sfd)
+		goto out_put_dentry;
+
+	f_mode = 0;
+	if (vmai->cpt_flags & VM_READ)
+		f_mode |= FMODE_READ;
+	if (vmai->cpt_flags & VM_WRITE)
+		f_mode |= FMODE_WRITE;
+	if (vmai->cpt_flags & VM_EXEC)
+		f_mode |= FMODE_EXEC;
+
+	err = -ENOMEM;
+	file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations);
+	if (!file)
+		goto out_free;
+
+	/* NOTE(review): shp is still dereferenced here after shm_unlock() - confirm the segment cannot go away. */
+	file->private_data = sfd;
+	file->f_mapping = shp->shm_file->f_mapping;
+	sfd->id = shp->shm_perm.id;
+	sfd->ns = get_ipc_ns(ipc_ns);
+	sfd->file = shp->shm_file;
+	sfd->vm_ops = NULL;
+
+	return file;
+
+out_free:
+	kfree(sfd);
+out_put_dentry:
+	dput(path.dentry);
+err_out: + return ERR_PTR(err); +} + +static int attach_one_undo(int semid, struct sem_array *sma, void *arg) +{ + struct sem_undo *su = arg; + struct sem_undo_list *undo_list = current->sysvsem.undo_list; + + if (semid != su->semid) + return 0; + + list_add(&su->list_proc, &undo_list->list_proc); + list_add(&su->list_id, &sma->list_id); + + return 1; +} + +static int attach_undo(struct sem_undo *su) +{ + return sysvipc_walk_sem(attach_one_undo, su); +} + +static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx) +{ + int err; + struct sem_undo_list *undo_list; + + if (current->sysvsem.undo_list) { + eprintk_ctx("Funny undo_list\n"); + return 0; + } + + undo_list = kzalloc(sizeof(struct sem_undo_list), GFP_KERNEL_UBC); + if (undo_list == NULL) + return -ENOMEM; + + atomic_set(&undo_list->refcnt, 1); + spin_lock_init(&undo_list->lock); + INIT_LIST_HEAD(&undo_list->list_proc); + current->sysvsem.undo_list = undo_list; + + if (sui->cpt_next > sui->cpt_hdrlen) { + loff_t offset = pos + sui->cpt_hdrlen; + do { + struct sem_undo *new; + struct cpt_sysvsem_undo_image spi; + err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx); + if (err) + goto out; + new = kmalloc(sizeof(struct sem_undo) + + sizeof(short)*spi.cpt_nsem, + GFP_KERNEL_UBC); + if (!new) { + err = -ENOMEM; + goto out; + } + + memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem); + new->semadj = (short *) &new[1]; + new->semid = spi.cpt_id; + err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen); + if (err) { + kfree(new); + goto out; + } + err = attach_undo(new); + if (err <= 0) { + if (err == 0) + err = -ENOENT; + kfree(new); + goto out; + } + offset += spi.cpt_next; + } while (offset < pos + sui->cpt_next); + } + err = 0; + +out: + return err; +} + +__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + +#if 0 + if (ti->cpt_sysvsem_undo == CPT_NULL || + 
lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo))
+		flag |= CLONE_SYSVSEM;
+#endif
+	return flag;
+}
+
+/*
+ * Give the current task the SysV semaphore undo list recorded in its image.
+ *
+ *  - No undo list in the image (CPT_NULL): exit_sem(current) is called and
+ *    0 is returned.
+ *  - The list was already restored by another task: unless the current task
+ *    already points at it, exit_sem(current) is called, a reference is taken
+ *    and the shared list is installed in current->sysvsem.undo_list.
+ *  - Otherwise the list is restored with do_rst_semundo() and registered as
+ *    a CPT_OBJ_SYSVSEM_UNDO object at its image position so that later
+ *    tasks can find it.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	int err;
+	struct sem_undo_list *f = current->sysvsem.undo_list;
+	cpt_object_t *obj;
+	struct cpt_object_hdr sui;
+
+	if (ti->cpt_sysvsem_undo == CPT_NULL) {
+		exit_sem(current);
+		return 0;
+	}
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx);
+	if (obj) {
+		if (obj->o_obj != f) {
+			exit_sem(current);
+			f = obj->o_obj;
+			atomic_inc(&f->refcnt);
+			current->sysvsem.undo_list = f;
+		}
+		return 0;
+	}
+
+	if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0)
+		goto out;
+
+	if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0)
+		goto out;
+
+	/*
+	 * do_rst_semundo() may have installed a freshly allocated list, so the
+	 * pointer sampled on entry is stale.  Register the list the task owns
+	 * now; otherwise sharers would later pick up the old (NULL) pointer.
+	 */
+	f = current->sysvsem.undo_list;
+
+	err = -ENOMEM;
+	obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx);
+	if (obj) {
+		err = 0;
+		cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx);
+	}
+
+	/* Fall through so that a failed registration is reported, not dropped. */
+out:
+	return err;
+}
+
+/* Argument bundle handed to fixup_one_sem() through sysvipc_walk_sem(). */
+struct _sarg {
+	int semid;
+	struct cpt_sysvsem_image *v;
+	__u32 *arr;
+};
+
+/*
+ * sysvipc_walk_sem() callback: when @semid is the set named in the _sarg,
+ * copy ownership, mode, sequence number and times from the image and load
+ * the semaphore array from warg->arr.  Returns 1 on a match, 0 otherwise.
+ */
+static int fixup_one_sem(int semid, struct sem_array *sma, void *arg)
+{
+	struct _sarg *warg = arg;
+
+	if (semid != warg->semid)
+		return 0;
+
+	sma->sem_perm.uid = warg->v->cpt_uid;
+	sma->sem_perm.gid = warg->v->cpt_gid;
+	sma->sem_perm.cuid = warg->v->cpt_cuid;
+	sma->sem_perm.cgid = warg->v->cpt_cgid;
+	sma->sem_perm.mode = warg->v->cpt_mode;
+	sma->sem_perm.seq = warg->v->cpt_seq;
+
+	sma->sem_ctime = warg->v->cpt_ctime;
+	sma->sem_otime = warg->v->cpt_otime;
+	/* NOTE(review): 8 bytes per semaphore is assumed to match the image layout and sizeof(struct sem) - confirm. */
+	memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8);
+	return 1;
+}
+
+/* Apply image @v and semaphore data @arr to the set with id @semid. */
+static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr)
+{
+	struct _sarg warg;
+
+	warg.semid = semid;
+	warg.v = v;
+	warg.arr = arr;
+
+	return sysvipc_walk_sem(fixup_one_sem, &warg);
+}
+
+
+static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si,
+		       struct cpt_context *ctx)
+{
+	int err;
+	__u32 *arr;
+	int nsems = (si->cpt_next -
si->cpt_hdrlen)/8; + + arr = kmalloc(nsems*8, GFP_KERNEL); + if (!arr) + return -ENOMEM; + + err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen); + if (err) + goto out; + err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode); + if (err < 0) { + eprintk_ctx("SEM 3\n"); + goto out; + } + err = fixup_sem(si->cpt_id, si, arr); + if (err == 0) + err = -ESRCH; + if (err > 0) + err = 0; +out: + kfree(arr); + return err; +} + +static int rst_sysv_sem(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_sysvsem_image sbuf; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int err; + err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx); + if (err) + return err; + err = restore_sem(sec, &sbuf, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + return 0; +} + +struct _marg { + int msqid; + struct cpt_sysvmsg_image *v; + struct msg_queue *m; +}; + +static int fixup_one_msg(int msqid, struct msg_queue *msq, void *arg) +{ + struct _marg *warg = arg; + + if (msqid != warg->msqid) + return 0; + + msq->q_perm.uid = warg->v->cpt_uid; + msq->q_perm.gid = warg->v->cpt_gid; + msq->q_perm.cuid = warg->v->cpt_cuid; + msq->q_perm.cgid = warg->v->cpt_cgid; + msq->q_perm.mode = warg->v->cpt_mode; + msq->q_perm.seq = warg->v->cpt_seq; + + msq->q_stime = warg->v->cpt_stime; + msq->q_rtime = warg->v->cpt_rtime; + msq->q_ctime = warg->v->cpt_ctime; + msq->q_lspid = warg->v->cpt_last_sender; + msq->q_lrpid = warg->v->cpt_last_receiver; + msq->q_qbytes = warg->v->cpt_qbytes; + + warg->m = msq; + return 1; +} + +struct _larg +{ + cpt_context_t * ctx; + loff_t pos; +}; + +static int do_load_msg(void * dst, int len, int offset, void * data) +{ + struct 
_larg * arg = data; + return arg->ctx->pread(dst, len, arg->ctx, arg->pos + offset); +} + +static int fixup_msg(int msqid, struct cpt_sysvmsg_image *v, loff_t pos, + cpt_context_t * ctx) +{ + int err; + struct _marg warg; + loff_t endpos = pos + v->cpt_next; + struct ipc_namespace *ns = current->nsproxy->ipc_ns; + + pos += v->cpt_hdrlen; + + warg.msqid = msqid; + warg.v = v; + + err = sysvipc_walk_msg(fixup_one_msg, &warg); + if (err <= 0) + return err; + + while (pos < endpos) { + struct cpt_sysvmsg_msg_image mi; + struct msg_msg *m; + struct _larg data = { + .ctx = ctx + }; + + err = rst_get_object(CPT_OBJ_SYSVMSG_MSG, pos, &mi, ctx); + if (err) + return err; + data.pos = pos + mi.cpt_hdrlen; + m = sysv_msg_load(do_load_msg, mi.cpt_size, &data); + if (IS_ERR(m)) + return PTR_ERR(m); + m->m_type = mi.cpt_type; + m->m_ts = mi.cpt_size; + list_add_tail(&m->m_list, &warg.m->q_messages); + warg.m->q_cbytes += m->m_ts; + warg.m->q_qnum++; + atomic_add(m->m_ts, &ns->msg_bytes); + atomic_inc(&ns->msg_hdrs); + + pos += mi.cpt_next; + } + return 1; +} + +static int restore_msg(loff_t pos, struct cpt_sysvmsg_image *si, + struct cpt_context *ctx) +{ + int err; + + err = sysvipc_setup_msg(si->cpt_key, si->cpt_id, si->cpt_mode); + if (err < 0) { + eprintk_ctx("MSG 3\n"); + goto out; + } + err = fixup_msg(si->cpt_id, si, pos, ctx); + if (err == 0) + err = -ESRCH; + if (err > 0) + err = 0; +out: + return err; +} + +static int rst_sysv_msg(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SYSV_MSG]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_sysvmsg_image sbuf; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_SYSV_MSG || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int err; + err = rst_get_object(CPT_OBJ_SYSVMSG, sec, &sbuf, ctx); + if (err) + return err; + err = 
restore_msg(sec, &sbuf, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + return 0; +} + + +int rst_sysv_ipc(struct cpt_context *ctx) +{ + int err; + + err = rst_sysv_sem(ctx); + if (!err) + err = rst_sysv_msg(ctx); + + return err; +} diff -urNp linux-2.6.32.48/kernel/cpt/rst_tty.c linux-2.6.32.48-openvz/kernel/cpt/rst_tty.c --- linux-2.6.32.48/kernel/cpt/rst_tty.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_tty.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,384 @@ +/* + * + * kernel/cpt/rst_tty.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_files.h" +#include "cpt_kernel.h" + +static int pty_setup(struct tty_struct *stty, loff_t pos, + struct cpt_tty_image *pi, struct cpt_context *ctx) +{ + unsigned long flags; + + stty->pgrp = NULL; + stty->session = NULL; + stty->packet = pi->cpt_packet; + stty->stopped = pi->cpt_stopped; + stty->hw_stopped = pi->cpt_hw_stopped; + stty->flow_stopped = pi->cpt_flow_stopped; +#define DONOT_CHANGE ((1<flags & DONOT_CHANGE; + stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE); + stty->ctrl_status = pi->cpt_ctrl_status; + stty->winsize.ws_row = pi->cpt_ws_row; + stty->winsize.ws_col = pi->cpt_ws_col; + stty->winsize.ws_ypixel = pi->cpt_ws_prow; + stty->winsize.ws_xpixel = pi->cpt_ws_pcol; + stty->canon_column = pi->cpt_canon_column; + stty->column = pi->cpt_column; + stty->raw = pi->cpt_raw; + stty->real_raw = pi->cpt_real_raw; + stty->erasing = pi->cpt_erasing; + stty->lnext = pi->cpt_lnext; + stty->icanon = pi->cpt_icanon; + stty->closing = pi->cpt_closing; + stty->minimum_to_wake = 
pi->cpt_minimum_to_wake; + + stty->termios->c_iflag = pi->cpt_c_iflag; + stty->termios->c_oflag = pi->cpt_c_oflag; + stty->termios->c_lflag = pi->cpt_c_lflag; + stty->termios->c_cflag = pi->cpt_c_cflag; + memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS); + memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags)); + + if (pi->cpt_next > pi->cpt_hdrlen) { + int err; + struct cpt_obj_bits b; + err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx); + if (err) + return err; + if (b.cpt_size == 0) + return 0; + err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen); + if (err) + return err; + + spin_lock_irq(&stty->read_lock); + stty->read_tail = 0; + stty->read_cnt = b.cpt_size; + stty->read_head = b.cpt_size; + stty->canon_head = stty->read_tail + pi->cpt_canon_head; + stty->canon_data = pi->cpt_canon_data; + spin_unlock_irq(&stty->read_lock); + } + + return 0; +} + +/* Find slave/master tty in image, when we already know master/slave. + * It might be optimized, of course. 
*/ +static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_TTY]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_tty_image *pibuf; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return CPT_NULL; + if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) + return CPT_NULL; + pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL); + if (pibuf == NULL) { + eprintk_ctx("cannot allocate buffer\n"); + return CPT_NULL; + } + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) + return CPT_NULL; + if (pibuf->cpt_index == pi->cpt_index && + !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) && + pos != sec) { + pty_setup(stty, sec, pibuf, ctx); + return sec; + } + sec += pibuf->cpt_next; + } + kfree(pibuf); + return CPT_NULL; +} + +static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master, + struct cpt_context *ctx) +{ + int err; + struct iattr newattrs; + struct dentry *d = master->f_dentry; + + newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; + newattrs.ia_uid = ii->cpt_uid; + newattrs.ia_gid = ii->cpt_gid; + newattrs.ia_mode = ii->cpt_mode; + + mutex_lock(&d->d_inode->i_mutex); + err = notify_change(d, &newattrs); + mutex_unlock(&d->d_inode->i_mutex); + + return err; +} + +/* NOTE: "portable", but ugly thing. To allocate /dev/pts/N, we open + * /dev/ptmx until we get pty with desired index. 
+ *
+ * Files opened along the way that carry another index are parked on a
+ * one-page stack and released before returning, so at most
+ * PAGE_SIZE/sizeof(struct file *) wrong ptys are held (-EBUSY beyond).
+ * Returns the matching file or an ERR_PTR.
+ */
+
+struct file *ptmx_open(int index, unsigned int flags)
+{
+	struct file *file;
+	struct file **stack = NULL;
+	int depth = 0;
+
+	for (;;) {
+		struct tty_struct *tty;
+
+		file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
+		if (IS_ERR(file))
+			break;
+		tty = file->private_data;
+		if (tty->index == index)
+			break;
+
+		if (depth == PAGE_SIZE/sizeof(struct file *)) {
+			fput(file);
+			file = ERR_PTR(-EBUSY);
+			break;
+		}
+		if (stack == NULL) {
+			stack = (struct file **)__get_free_page(GFP_KERNEL);
+			if (!stack) {
+				fput(file);
+				file = ERR_PTR(-ENOMEM);
+				break;
+			}
+		}
+		stack[depth] = file;
+		depth++;
+	}
+	while (depth > 0) {
+		depth--;
+		fput(stack[depth]);
+	}
+	if (stack)
+		free_page((unsigned long)stack);
+	return file;
+}
+
+
+/*
+ * Open the file for the pty end whose image sits at fi->cpt_priv.
+ *
+ * When a tty object for that position already owns a file, that file is
+ * reused (get_file() for a devpts master, dentry_open() otherwise).
+ * Otherwise master and slave are both opened and registered as
+ * CPT_OBJ_TTY/CPT_OBJ_FILE objects, and the end named by
+ * pi->cpt_drv_subtype is returned after get_file().
+ *
+ * Returns the file or an ERR_PTR.
+ */
+struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii,
+			   unsigned flags, struct cpt_context *ctx)
+{
+	int err;
+	cpt_object_t *obj;
+	struct file *master, *slave;
+	struct tty_struct *stty;
+	struct cpt_tty_image *pi;
+	static char *a = "pqrstuvwxyzabcde";
+	static char *b = "0123456789abcdef";
+	/* "/dev/pts/" plus any int plus NUL needs more than 16 bytes */
+	char pairname[32];
+	unsigned master_flags, slave_flags;
+
+	if (fi->cpt_priv == CPT_NULL)
+		return ERR_PTR(-EINVAL);
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx);
+	if (obj && obj->o_parent) {
+		dprintk_ctx("obtained pty as pair to existing\n");
+		master = obj->o_parent;
+		stty = master->private_data;
+
+		if (stty->driver->subtype == PTY_TYPE_MASTER &&
+		    (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) {
+			wprintk_ctx("cloning ptmx\n");
+			get_file(master);
+			return master;
+		}
+
+		master = dentry_open(dget(master->f_dentry),
+				     mntget(master->f_vfsmnt), flags, NULL);
+		if (!IS_ERR(master)) {
+			stty = master->private_data;
+			if (stty->driver->subtype != PTY_TYPE_MASTER)
+				fixup_tty_attrs(ii, master, ctx);
+		}
+		return master;
+	}
+
+	pi = cpt_get_buf(ctx);
+	err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx);
+	if (err) {
+		cpt_release_buf(ctx);
+		return ERR_PTR(err);
+	}
+
+	/* The caller's flags apply only to the end it asked for. */
+	master_flags = slave_flags = 0;
+	if (pi->cpt_drv_subtype == PTY_TYPE_MASTER)
+		master_flags = flags;
+	else
+		slave_flags = flags;
+
+	/*
+	 * Open pair master/slave.
+	 */
+	if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) {
+		master = ptmx_open(pi->cpt_index, master_flags);
+	} else {
+		/*
+		 * Legacy names are built from two 16-entry tables, so an
+		 * index taken from the image must stay below 16*16.
+		 */
+		if ((unsigned int)pi->cpt_index >= 16 * 16) {
+			eprintk_ctx("bad legacy pty index in image: %Ld\n", (long long)fi->cpt_priv);
+			cpt_release_buf(ctx);
+			return ERR_PTR(-EINVAL);
+		}
+		snprintf(pairname, sizeof(pairname), "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]);
+		master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
+	}
+	if (IS_ERR(master)) {
+		eprintk_ctx("filp_open master: %Ld %ld\n", (long long)fi->cpt_priv, PTR_ERR(master));
+		cpt_release_buf(ctx);
+		return master;
+	}
+	stty = master->private_data;
+	clear_bit(TTY_PTY_LOCK, &stty->flags);
+	if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM)
+		snprintf(pairname, sizeof(pairname), "/dev/pts/%d", stty->index);
+	else
+		snprintf(pairname, sizeof(pairname), "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]);
+	slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
+	if (IS_ERR(slave)) {
+		eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave));
+		fput(master);
+		cpt_release_buf(ctx);
+		return slave;
+	}
+
+	if (pi->cpt_drv_subtype != PTY_TYPE_MASTER)
+		fixup_tty_attrs(ii, slave, ctx);
+
+	cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx);
+	cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx);
+	cpt_object_add(CPT_OBJ_FILE, master, ctx);
+	cpt_object_add(CPT_OBJ_FILE, slave, ctx);
+
+	if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) {
+		loff_t pos;
+		obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx);
+		obj->o_parent = master;
+		cpt_obj_setpos(obj, fi->cpt_priv, ctx);
+		pty_setup(stty, fi->cpt_priv, pi, ctx);
+
+		obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx);
+		obj->o_parent = slave;
+		pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx);
+		cpt_obj_setpos(obj, pos, ctx);
+
+		obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx);
+		cpt_obj_setpos(obj, CPT_NULL, ctx);
+		get_file(master);
+		cpt_release_buf(ctx);
+		return master;
+	} else {
+		loff_t pos;
+		obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx);
+		obj->o_parent = slave;
+		cpt_obj_setpos(obj, fi->cpt_priv, ctx);
+		pty_setup(stty->link, fi->cpt_priv, pi, ctx);
+
+		obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx);
+		obj->o_parent = master;
+		pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx);
+		cpt_obj_setpos(obj, pos, ctx);
+
+		obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx);
+		cpt_obj_setpos(obj, CPT_NULL, ctx);
+		get_file(slave);
+		cpt_release_buf(ctx);
+		return slave;
+	}
+}
+
+/*
+ * Walk CPT_SECT_TTY and, for every image that has a tty object at its
+ * position, set the tty's pgrp and (if not yet set) session from the
+ * saved ids.  Returns 0, a pread error, or -EINVAL.
+ */
+int rst_tty_jobcontrol(struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_TTY];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		cpt_object_t *obj;
+		struct cpt_tty_image *pibuf = cpt_get_buf(ctx);
+
+		if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) {
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+
+		obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx);
+		if (obj) {
+			struct tty_struct *stty = obj->o_obj;
+			if ((int)pibuf->cpt_pgrp > 0) {
+				rcu_read_lock();
+				stty->pgrp = get_pid(alloc_vpid_safe(pibuf->cpt_pgrp));
+				rcu_read_unlock();
+				if (!stty->pgrp)
+					dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp);
+			} else if (pibuf->cpt_pgrp) {
+				/* non-zero but not positive: allocate a fresh pid */
+				stty->pgrp = alloc_pid(current->nsproxy->pid_ns,
+						       0);
+				if (!stty->pgrp) {
+					eprintk_ctx("cannot allocate stray tty->pgrp");
+					cpt_release_buf(ctx);
+					return -EINVAL;
+				}
+			}
+			if ((int)pibuf->cpt_session > 0) {
+				struct pid *sess;
+
+				rcu_read_lock();
+				sess = get_pid(alloc_vpid_safe(pibuf->cpt_session));
+				rcu_read_unlock();
+				if (!sess) {
+					dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session);
+				} else if (!stty->session) {
+					stty->session = sess;
+				}
+			}
+		}
+		sec += pibuf->cpt_next;
+		cpt_release_buf(ctx);
+	}
+	return 0;
+}
diff -urNp
linux-2.6.32.48/kernel/cpt/rst_ubc.c linux-2.6.32.48-openvz/kernel/cpt/rst_ubc.c --- linux-2.6.32.48/kernel/cpt/rst_ubc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_ubc.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,144 @@ +/* + * + * kernel/cpt/rst_ubc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx); + if (obj == NULL) { + eprintk("RST: unknown ub @%Ld\n", (long long)pos); + return get_beancounter(get_exec_ub()); + } + return get_beancounter(obj->o_obj); +} + +void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id) +{ + to[bc_parm_id].barrier = from[bc_parm_id].barrier; + to[bc_parm_id].limit = from[bc_parm_id].limit; +} + +void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id) +{ + ubprm[bc_parm_id].barrier = UB_MAXVALUE; + ubprm[bc_parm_id].limit = UB_MAXVALUE; +} + +static void restore_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm, + int held) +{ + prm->barrier = (dmp->barrier == CPT_NULL ? UB_MAXVALUE : dmp->barrier); + prm->limit = (dmp->limit == CPT_NULL ? 
UB_MAXVALUE : dmp->limit); + if (held) + prm->held = dmp->held; + prm->maxheld = dmp->maxheld; + prm->minheld = dmp->minheld; + prm->failcnt = dmp->failcnt; +} + +static int restore_one_bc(struct cpt_beancounter_image *v, + cpt_object_t *obj, struct cpt_context *ctx) +{ + struct user_beancounter *bc; + cpt_object_t *pobj; + int resources, i; + + if (v->cpt_parent != CPT_NULL) { + pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx); + if (pobj == NULL) + return -ESRCH; + bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1); + } else { + bc = get_exec_ub(); + while (bc->parent) + bc = bc->parent; + get_beancounter(bc); + } + if (bc == NULL) + return -ENOMEM; + obj->o_obj = bc; + + if (ctx->image_version < CPT_VERSION_18 && + CPT_VERSION_MINOR(ctx->image_version) < 1) + goto out; + + if (v->cpt_content == CPT_CONTENT_ARRAY) + resources = v->cpt_ub_resources; + else + resources = UB_RESOURCES_COMPAT; + + if (resources > UB_RESOURCES) + return -EINVAL; + + for (i = 0; i < resources; i++) { + restore_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); + restore_one_bc_parm(v->cpt_parms + i * 2 + 1, + bc->ub_store + i, 1); + } + +out: + if (!bc->parent) + for (i = 0; i < UB_RESOURCES; i++) + copy_one_ubparm(bc->ub_parms, ctx->saved_ubc, i); + + return 0; +} + +int rst_undump_ubc(struct cpt_context *ctx) +{ + loff_t start, end; + struct cpt_beancounter_image *v; + cpt_object_t *obj; + int err; + + err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end); + if (err) + return err; + + while (start < end) { + v = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_UBC, start, v, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + obj = alloc_cpt_object(GFP_KERNEL, ctx); + cpt_obj_setpos(obj, start, ctx); + intern_cpt_object(CPT_OBJ_UBC, obj, ctx); + + err = restore_one_bc(v, obj, ctx); + + cpt_release_buf(ctx); + if (err) + return err; + + start += v->cpt_next; + } + return 0; +} + +void rst_finish_ubc(struct cpt_context *ctx) +{ + cpt_object_t *obj; + 
+ for_each_object(obj, CPT_OBJ_UBC) + put_beancounter(obj->o_obj); +} diff -urNp linux-2.6.32.48/kernel/cpt/rst_undump.c linux-2.6.32.48-openvz/kernel/cpt/rst_undump.c --- linux-2.6.32.48/kernel/cpt/rst_undump.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpt/rst_undump.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,1074 @@ +/* + * + * kernel/cpt/rst_undump.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_socket.h" +#include "cpt_net.h" +#include "cpt_ubc.h" +#include "cpt_kernel.h" + +static int rst_utsname(cpt_context_t *ctx); + + +struct thr_context { + struct completion init_complete; + struct completion task_done; + int error; + struct cpt_context *ctx; + cpt_object_t *tobj; +}; + +static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx); + +static int vps_rst_veinfo(struct cpt_context *ctx) +{ + int err; + struct cpt_veinfo_image *i; + struct ve_struct *ve; + struct timespec delta; + loff_t start, end; + struct ipc_namespace *ns; + + err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end); + if (err) + goto out; + + i = cpt_get_buf(ctx); + memset(i, 0, sizeof(*i)); + err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx); + if (err) + goto out_rel; + + ve = get_exec_env(); + ns = ve->ve_ns->ipc_ns; + + /* Damn. Fatal mistake, these two values are size_t! */ + ns->shm_ctlall = i->shm_ctl_all ? : 0xFFFFFFFFU; + ns->shm_ctlmax = i->shm_ctl_max ? 
: 0xFFFFFFFFU; + ns->shm_ctlmni = i->shm_ctl_mni; + + ns->msg_ctlmax = i->msg_ctl_max; + ns->msg_ctlmni = i->msg_ctl_mni; + ns->msg_ctlmnb = i->msg_ctl_mnb; + + BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr)); + ns->sem_ctls[0] = i->sem_ctl_arr[0]; + ns->sem_ctls[1] = i->sem_ctl_arr[1]; + ns->sem_ctls[2] = i->sem_ctl_arr[2]; + ns->sem_ctls[3] = i->sem_ctl_arr[3]; + + cpt_timespec_import(&delta, i->start_timespec_delta); + _set_normalized_timespec(&ve->start_timespec, + ve->start_timespec.tv_sec - delta.tv_sec, + ve->start_timespec.tv_nsec - delta.tv_nsec); + ve->start_jiffies -= i->start_jiffies_delta; + // // FIXME: what??? + // // ve->start_cycles -= (s64)i->start_jiffies_delta * cycles_per_jiffy; + + ctx->last_vpid = i->last_pid; + if (i->rnd_va_space) + ve->_randomize_va_space = i->rnd_va_space - 1; + + err = 0; +out_rel: + cpt_release_buf(ctx); +out: + return err; +} + +static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err; + struct env_create_param3 param; + + do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time); + do_gettimespec(&ctx->delta_time); + + _set_normalized_timespec(&ctx->delta_time, + ctx->delta_time.tv_sec - ctx->start_time.tv_sec, + ctx->delta_time.tv_nsec - ctx->start_time.tv_nsec); + ctx->delta_nsec = (s64)ctx->delta_time.tv_sec*NSEC_PER_SEC + ctx->delta_time.tv_nsec; + if (ctx->delta_nsec < 0) { + wprintk_ctx("Wall time is behind source by %Ld ns, " + "time sensitive applications can misbehave\n", (long long)-ctx->delta_nsec); + } + + _set_normalized_timespec(&ctx->cpt_monotonic_time, + ctx->cpt_monotonic_time.tv_sec - ctx->delta_time.tv_sec, + ctx->cpt_monotonic_time.tv_nsec - ctx->delta_time.tv_nsec); + + memset(¶m, 0, sizeof(param)); + param.iptables_mask = ctx->iptables_mask; + param.feature_mask = ctx->features; + + /* feature_mask is set as required - pretend we know everything */ + param.known_features = (ctx->image_version < CPT_VERSION_18) ? 
+ VE_FEATURES_OLD : ~(__u64)0; + + err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK|VE_EXCLUSIVE, 2, + ¶m, sizeof(param)); + if (err < 0) + eprintk_ctx("real_env_create: %d\n", err); + + get_exec_env()->jiffies_fixup = + (ctx->delta_time.tv_sec < 0 ? + 0 : timespec_to_jiffies(&ctx->delta_time)) - + (unsigned long)(get_jiffies_64() - ctx->virt_jiffies64); + dprintk_ctx("JFixup %ld %Ld\n", get_exec_env()->jiffies_fixup, + (long long)ctx->delta_nsec); + return err < 0 ? err : 0; +} + + +static int rst_creds(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct cred *cred; + struct user_struct *user; + struct group_info *gids; + int i; + + cred = prepare_creds(); + if (cred == NULL) + goto err_cred; + + user = alloc_uid(get_exec_env()->user_ns, ti->cpt_user); + if (user == NULL) + goto err_uid; + + gids = groups_alloc(ti->cpt_ngids); + if (gids == NULL) + goto err_gids; + + free_uid(cred->user); + cred->user = user; + + for (i=0; i<32; i++) + gids->small_block[i] = ti->cpt_gids[i]; + + put_group_info(cred->group_info); + cred->group_info = gids; + + cred->uid = ti->cpt_uid; + cred->euid = ti->cpt_euid; + cred->suid = ti->cpt_suid; + cred->fsuid = ti->cpt_fsuid; + cred->gid = ti->cpt_gid; + cred->egid = ti->cpt_egid; + cred->sgid = ti->cpt_sgid; + cred->fsgid = ti->cpt_fsgid; + + memcpy(&cred->cap_effective, &ti->cpt_ecap, + sizeof(cred->cap_effective)); + memcpy(&cred->cap_inheritable, &ti->cpt_icap, + sizeof(cred->cap_inheritable)); + memcpy(&cred->cap_permitted, &ti->cpt_pcap, + sizeof(cred->cap_permitted)); + + if (ctx->image_version < CPT_VERSION_26) + cred->securebits = (ti->cpt_keepcap != 0) ? 
+ issecure_mask(SECURE_KEEP_CAPS) : 0; + else + cred->securebits = ti->cpt_keepcap; + + commit_creds(cred); + return 0; + +err_gids: + free_uid(user); +err_uid: + abort_creds(cred); +err_cred: + return -ENOMEM; +} + +static int hook(void *arg) +{ + struct thr_context *thr_ctx = arg; + struct cpt_context *ctx; + cpt_object_t *tobj; + struct cpt_task_image *ti; + int err = 0; + int exiting = 0; + + current->state = TASK_UNINTERRUPTIBLE; + complete(&thr_ctx->init_complete); + schedule(); + + ctx = thr_ctx->ctx; + tobj = thr_ctx->tobj; + ti = tobj->o_image; + + current->fs->umask = 0; + + if (ti->cpt_pid == 1) { +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *bc; +#endif + + err = vps_rst_reparent_root(tobj, ctx); + + if (err) { + rst_report_error(err, ctx); + goto out; + } + + memcpy(&get_exec_env()->ve_cap_bset, &ti->cpt_ecap, sizeof(kernel_cap_t)); + + if (ctx->statusfile) { + fput(ctx->statusfile); + ctx->statusfile = NULL; + } + + if (ctx->lockfile) { + char b; + mm_segment_t oldfs; + err = -EINVAL; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (ctx->lockfile->f_op && ctx->lockfile->f_op->read) + err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos); + set_fs(oldfs); + fput(ctx->lockfile); + ctx->lockfile = NULL; + } + + if (err) { + eprintk_ctx("CPT: lock fd is closed incorrectly: %d\n", err); + goto out; + } + err = vps_rst_veinfo(ctx); + if (err) { + eprintk_ctx("rst_veinfo: %d\n", err); + goto out; + } + + err = rst_utsname(ctx); + if (err) { + eprintk_ctx("rst_utsname: %d\n", err); + goto out; + } + + err = rst_files_std(ti, ctx); + if (err) { + eprintk_ctx("rst_root_stds: %d\n", err); + goto out; + } + + err = rst_root_namespace(ctx); + if (err) { + eprintk_ctx("rst_namespace: %d\n", err); + goto out; + } + + if ((err = rst_restore_net(ctx)) != 0) { + eprintk_ctx("rst_restore_net: %d\n", err); + goto out; + } + + err = rst_sockets(ctx); + if (err) { + eprintk_ctx("rst_sockets: %d\n", err); + goto out; + } + err = 
rst_sysv_ipc(ctx); + if (err) { + eprintk_ctx("rst_sysv_ipc: %d\n", err); + goto out; + } +#ifdef CONFIG_BEANCOUNTERS + bc = get_exec_ub(); + set_one_ubparm_to_max(bc->ub_parms, UB_KMEMSIZE); + set_one_ubparm_to_max(bc->ub_parms, UB_NUMPROC); + set_one_ubparm_to_max(bc->ub_parms, UB_NUMFILE); + set_one_ubparm_to_max(bc->ub_parms, UB_DCACHESIZE); +#endif + } + + if ((err = rst_creds(ti, ctx)) != 0) { + eprintk_ctx("rst_creds: %d\n", err); + goto out; + } + + if ((err = rst_mm_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_mm: %d\n", err); + goto out; + } + + if ((err = rst_files_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_files: %d\n", err); + goto out; + } + + if ((err = rst_fs_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_fs: %d\n", err); + goto out; + } + + if ((err = rst_semundo_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_semundo: %d\n", err); + goto out; + } + + if ((err = rst_signal_complete(ti, &exiting, ctx)) != 0) { + eprintk_ctx("rst_signal: %d\n", err); + goto out; + } + + if (ti->cpt_personality != 0) + __set_personality(ti->cpt_personality); + +#ifdef CONFIG_X86_64 + /* 32bit app from 32bit OS, won't have PER_LINUX32 set... 
:/ */ + if (!ti->cpt_64bit) + __set_personality(PER_LINUX32); +#endif + + current->set_child_tid = NULL; + current->clear_child_tid = NULL; + current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV); + current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV); + current->exit_code = ti->cpt_exit_code; + current->pdeath_signal = ti->cpt_pdeath_signal; + + if (ti->cpt_restart.fn != CPT_RBL_0) { + if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP + || ti->cpt_restart.fn == CPT_RBL_COMPAT_NANOSLEEP + ) { + struct restart_block *rb; + ktime_t e; + + e.tv64 = 0; + + if (ctx->image_version >= CPT_VERSION_20) + e = ktime_add_ns(e, ti->cpt_restart.arg2); + else if (ctx->image_version >= CPT_VERSION_9) + e = ktime_add_ns(e, ti->cpt_restart.arg0); + else + e = ktime_add_ns(e, ti->cpt_restart.arg0*TICK_NSEC); + if (e.tv64 < 0) + e.tv64 = TICK_NSEC; + e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + + rb = &task_thread_info(current)->restart_block; + rb->fn = hrtimer_nanosleep_restart; +#ifdef CONFIG_COMPAT + if (ti->cpt_restart.fn == CPT_RBL_COMPAT_NANOSLEEP) + rb->fn = compat_nanosleep_restart; +#endif + if (ctx->image_version >= CPT_VERSION_20) { + rb->arg0 = ti->cpt_restart.arg0; + rb->arg1 = ti->cpt_restart.arg1; + rb->arg2 = e.tv64 & 0xFFFFFFFF; + rb->arg3 = e.tv64 >> 32; + } else if (ctx->image_version >= CPT_VERSION_9) { + rb->arg0 = ti->cpt_restart.arg2; + rb->arg1 = ti->cpt_restart.arg3; + rb->arg2 = e.tv64 & 0xFFFFFFFF; + rb->arg3 = e.tv64 >> 32; + } else { + rb->arg0 = ti->cpt_restart.arg1; + rb->arg1 = CLOCK_MONOTONIC; + rb->arg2 = e.tv64 & 0xFFFFFFFF; + rb->arg3 = e.tv64 >> 32; + } + } else if (ti->cpt_restart.fn == CPT_RBL_POLL) { + struct restart_block *rb; + ktime_t e; + struct timespec ts; + unsigned long timeout_jiffies; + + e.tv64 = 0; + e = ktime_add_ns(e, ti->cpt_restart.arg2); + e = ktime_sub(e, timespec_to_ktime(ctx->delta_time)); + ts = ns_to_timespec(ktime_to_ns(e)); + timeout_jiffies = timespec_to_jiffies(&ts); + + rb = 
&task_thread_info(current)->restart_block; + rb->fn = do_restart_poll; + rb->arg0 = ti->cpt_restart.arg0; + rb->arg1 = ti->cpt_restart.arg1; + rb->arg2 = timeout_jiffies & 0xFFFFFFFF; + rb->arg3 = (u64)timeout_jiffies >> 32; + } else if (ti->cpt_restart.fn == CPT_RBL_FUTEX_WAIT) { + struct restart_block *rb; + ktime_t e; + + e.tv64 = 0; + e = ktime_add_ns(e, ti->cpt_restart.arg2); + e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + + rb = &task_thread_info(current)->restart_block; + rb->fn = futex_wait_restart; + rb->futex.uaddr = (void *)(unsigned long)ti->cpt_restart.arg0; + rb->futex.val = ti->cpt_restart.arg1; + rb->futex.time = e.tv64; + rb->futex.flags = ti->cpt_restart.arg3; + } else + eprintk_ctx("unknown restart block (%d)\n", ti->cpt_restart.fn); + } + + if (thread_group_leader(current)) { + current->signal->it_real_incr.tv64 = 0; + if (ctx->image_version >= CPT_VERSION_9) { + current->signal->it_real_incr = + ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr); + } else { + current->signal->it_real_incr = + ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC); + } + current->signal->it[CPUCLOCK_PROF].incr = ti->cpt_it_prof_incr; + current->signal->it[CPUCLOCK_VIRT].incr = ti->cpt_it_virt_incr; + current->signal->it[CPUCLOCK_PROF].expires = ti->cpt_it_prof_value; + current->signal->it[CPUCLOCK_VIRT].expires = ti->cpt_it_virt_value; + } + + err = rst_clone_children(tobj, ctx); + if (err) { + eprintk_ctx("rst_clone_children\n"); + goto out; + } + + if (exiting) + current->signal->flags |= SIGNAL_GROUP_EXIT; + + if (ti->cpt_pid == 1) { + if ((err = rst_process_linkage(ctx)) != 0) { + eprintk_ctx("rst_process_linkage: %d\n", err); + goto out; + } + if ((err = rst_do_filejobs(ctx)) != 0) { + eprintk_ctx("rst_do_filejobs: %d\n", err); + goto out; + } + if ((err = rst_eventpoll(ctx)) != 0) { + eprintk_ctx("rst_eventpoll: %d\n", err); + goto out; + } +#ifdef CONFIG_INOTIFY_USER + if ((err = rst_inotify(ctx)) != 
0) { + eprintk_ctx("rst_inotify: %d\n", err); + goto out; + } +#endif + if ((err = rst_sockets_complete(ctx)) != 0) { + eprintk_ctx("rst_sockets_complete: %d\n", err); + goto out; + } + if ((err = rst_stray_files(ctx)) != 0) { + eprintk_ctx("rst_stray_files: %d\n", err); + goto out; + } + if ((err = rst_posix_locks(ctx)) != 0) { + eprintk_ctx("rst_posix_locks: %d\n", err); + goto out; + } + if ((err = rst_tty_jobcontrol(ctx)) != 0) { + eprintk_ctx("rst_tty_jobcontrol: %d\n", err); + goto out; + } + if ((err = rst_restore_fs(ctx)) != 0) { + eprintk_ctx("rst_restore_fs: %d\n", err); + goto out; + } + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RESTORE, ctx) & NOTIFY_FAIL) { + err = -ECHRNG; + eprintk_ctx("scp_restore failed\n"); + goto out; + } + if (ctx->last_vpid) + get_exec_env()->ve_ns->pid_ns->last_pid = + ctx->last_vpid; + } + +out: + thr_ctx->error = err; + complete(&thr_ctx->task_done); + + if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + current->flags |= PF_EXIT_RESTART; + do_exit(ti->cpt_exit_code); + } else { + __set_current_state(TASK_UNINTERRUPTIBLE); + } + + schedule(); + + dprintk_ctx("leaked through %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm); + + module_put(THIS_MODULE); + complete_and_exit(NULL, 0); + return 0; +} + +#if 0 +static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct task_beancounter *tbc; + + tbc = task_bc(current); + + put_beancounter(tbc->fork_sub); + tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx); + if (ti->cpt_mm_ub != CPT_NULL) { + put_beancounter(tbc->exec_ub); + tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx); + } +} +#endif + +static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx, + struct thr_context *thr_ctx) +{ + struct task_struct *tsk; + int pid; + + thr_ctx->ctx = ctx; + thr_ctx->error = 0; + init_completion(&thr_ctx->init_complete); + init_completion(&thr_ctx->task_done); +#if 0 + set_task_ubs(obj->o_image, ctx); 
+#endif + + pid = local_kernel_thread(hook, thr_ctx, 0, 0); + if (pid < 0) + return pid; + read_lock(&tasklist_lock); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (tsk == NULL) + return -ESRCH; + cpt_obj_setobj(obj, tsk, ctx); + thr_ctx->tobj = obj; + return 0; +} + +static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm)); + rst_mm_basic(obj, ti, ctx); + return 0; +} + +static int make_baby(cpt_object_t *cobj, + struct cpt_task_image *pi, + struct cpt_context *ctx) +{ + unsigned long flags; + struct cpt_task_image *ci = cobj->o_image; + struct thr_context thr_ctx; + struct task_struct *tsk; + pid_t pid; + struct fs_struct *tfs = NULL; + + flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx) + | rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx); + if (ci->cpt_rppid != pi->cpt_pid) { + flags |= CLONE_THREAD|CLONE_PARENT; + if (ci->cpt_signal != pi->cpt_signal || + !(flags&CLONE_SIGHAND) || + (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) { + eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n", + (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid, + (long long)ci->cpt_signal, (long long)pi->cpt_signal, flags + ); + return -EINVAL; + } + } + + thr_ctx.ctx = ctx; + thr_ctx.error = 0; + init_completion(&thr_ctx.init_complete); + init_completion(&thr_ctx.task_done); + thr_ctx.tobj = cobj; + +#if 0 + set_task_ubs(ci, ctx); +#endif + + if (current->fs == NULL) { + tfs = get_exec_env()->ve_ns->pid_ns->child_reaper->fs; + if (tfs == NULL) + return -EINVAL; + write_lock(&tfs->lock); + tfs->users++; + write_unlock(&tfs->lock); + current->fs = tfs; + } + pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid); + if (tfs) { + current->fs = NULL; + write_lock(&tfs->lock); + tfs->users--; + WARN_ON(tfs->users == 0); + 
write_unlock(&tfs->lock); + } + if (pid < 0) + return pid; + + read_lock(&tasklist_lock); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (tsk == NULL) + return -ESRCH; + cpt_obj_setobj(cobj, tsk, ctx); + thr_ctx.tobj = cobj; + wait_for_completion(&thr_ctx.init_complete); + wait_task_inactive(cobj->o_obj, 0); + rst_basic_init_task(cobj, ctx); + + /* clone() increases group_stop_count if it was not zero and + * CLONE_THREAD was asked. Undo. + */ + if (current->signal->group_stop_count && (flags & CLONE_THREAD)) { + if (tsk->signal != current->signal) BUG(); + current->signal->group_stop_count--; + } + + wake_up_process(tsk); + wait_for_completion(&thr_ctx.task_done); + wait_task_inactive(tsk, 0); + + return thr_ctx.error; +} + +static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err = 0; + struct cpt_task_image *ti = obj->o_image; + cpt_object_t *cobj; + + for_each_object(cobj, CPT_OBJ_TASK) { + struct cpt_task_image *ci = cobj->o_image; + if (cobj == obj) + continue; + if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) || + (ci->cpt_leader == ti->cpt_pid && + ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) { + err = make_baby(cobj, ti, ctx); + if (err) { + eprintk_ctx("make_baby: %d\n", err); + return err; + } + } + } + return 0; +} + +static int read_task_images(struct cpt_context *ctx) +{ + int err; + loff_t start, end; + + err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end); + if (err) + return err; + + while (start < end) { + cpt_object_t *obj; + struct cpt_task_image *ti = cpt_get_buf(ctx); + + err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } +#if 0 + if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) { + eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid); + cpt_release_buf(ctx); + return -EINVAL; + } +#endif + obj = alloc_cpt_object(GFP_KERNEL, ctx); + cpt_obj_setpos(obj, start, ctx); 
+ intern_cpt_object(CPT_OBJ_TASK, obj, ctx); + obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL); + if (obj->o_image == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + memcpy(obj->o_image, ti, sizeof(*ti)); + err = ctx->pread(obj->o_image + sizeof(*ti), + ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti)); + cpt_release_buf(ctx); + if (err) + return err; + start += ti->cpt_next; + } + return 0; +} + + +static int vps_rst_restore_tree(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct thr_context thr_ctx_root; + + err = read_task_images(ctx); + if (err) + return err; + + err = rst_undump_ubc(ctx); + if (err) + return err; + + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTCHECK, ctx) & NOTIFY_FAIL) + return -ECHRNG; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = rst_setup_pagein(ctx); + if (err) + return err; +#endif + for_each_object(obj, CPT_OBJ_TASK) { + err = create_root_task(obj, ctx, &thr_ctx_root); + if (err) + return err; + + wait_for_completion(&thr_ctx_root.init_complete); + wait_task_inactive(obj->o_obj, 0); + rst_basic_init_task(obj, ctx); + + wake_up_process(obj->o_obj); + wait_for_completion(&thr_ctx_root.task_done); + wait_task_inactive(obj->o_obj, 0); + err = thr_ctx_root.error; + if (err) + return err; + break; + } + + return err; +} + +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +int rst_read_vdso(struct cpt_context *ctx) +{ + int err; + loff_t start, end; + struct cpt_page_block *pgb; + + ctx->vdso = NULL; + err = rst_get_section(CPT_SECT_VSYSCALL, ctx, &start, &end); + if (err) + return err; + if (start == CPT_NULL) + return 0; + if (end < start + sizeof(*pgb) + PAGE_SIZE) + return -EINVAL; + + pgb = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_VSYSCALL, start, pgb, ctx); + if (err) { + goto err_buf; + } + ctx->vdso = (char*)__get_free_page(GFP_KERNEL); + if (ctx->vdso == NULL) { + err = -ENOMEM; + goto err_buf; + } + err = ctx->pread(ctx->vdso, PAGE_SIZE, ctx, start + sizeof(*pgb)); + if (err) + goto 
err_page; + if (!memcmp(ctx->vdso, vsyscall_addr, PAGE_SIZE)) { + free_page((unsigned long)ctx->vdso); + ctx->vdso = NULL; + } + + cpt_release_buf(ctx); + return 0; +err_page: + free_page((unsigned long)ctx->vdso); + ctx->vdso = NULL; +err_buf: + cpt_release_buf(ctx); + return err; +} +#endif + +int vps_rst_undump(struct cpt_context *ctx) +{ + int err; + unsigned long umask; + + err = rst_open_dumpfile(ctx); + if (err) + return err; + + if (ctx->tasks64) { +#if defined(CONFIG_IA64) + if (ctx->image_arch != CPT_OS_ARCH_IA64) +#elif defined(CONFIG_X86_64) + if (ctx->image_arch != CPT_OS_ARCH_EMT64) +#else + if (1) +#endif + { + eprintk_ctx("Cannot restore 64 bit container on this architecture\n"); + return -EINVAL; + } + } + + umask = current->fs->umask; + current->fs->umask = 0; + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = rst_setup_pagein(ctx); +#endif +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) + if (err == 0) + err = rst_read_vdso(ctx); +#endif + if (err == 0) + err = vps_rst_restore_tree(ctx); + + if (err == 0) + err = rst_restore_process(ctx); + + if (err) + virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTFAIL, ctx); + + current->fs->umask = umask; + + return err; +} + +static int rst_unlock_ve(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + down_write(&env->op_sem); + env->is_locked = 0; + up_write(&env->op_sem); + put_ve(env); + return 0; +} + +int recalc_sigpending_tsk(struct task_struct *t); + +int rst_resume(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int err = 0; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *bc; +#endif + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + fput(file); + } + +#ifdef CONFIG_BEANCOUNTERS + bc = get_beancounter_byuid(ctx->ve_id, 0); + BUG_ON(!bc); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_KMEMSIZE); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMPROC); + copy_one_ubparm(ctx->saved_ubc, 
bc->ub_parms, UB_NUMFILE); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_DCACHESIZE); + put_beancounter(bc); +#endif + + rst_resume_network(ctx); + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + if (!tsk) + continue; + + if (ti->cpt_state == TASK_UNINTERRUPTIBLE) { + dprintk_ctx("task %d/%d(%s) is started\n", task_pid_vnr(tsk), tsk->pid, tsk->comm); + + /* Weird... If a signal is sent to stopped task, + * nobody makes recalc_sigpending(). We have to do + * this by hands after wake_up_process(). + * if we did this before a signal could arrive before + * wake_up_process() and stall. + */ + spin_lock_irq(&tsk->sighand->siglock); + if (!signal_pending(tsk)) + recalc_sigpending_tsk(tsk); + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + } else { + if (ti->cpt_state == TASK_STOPPED || + ti->cpt_state == TASK_TRACED) { + set_task_state(tsk, ti->cpt_state); + } + } + put_task_struct(tsk); + } + + rst_unlock_ve(ctx); + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + rst_complete_pagein(ctx, 0); +#endif + + rst_finish_ubc(ctx); + cpt_object_destroy(ctx); + + return err; +} + +int rst_kill(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int err = 0; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + fput(file); + } + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + if (tsk == NULL) + continue; + + if (tsk->exit_state == 0) { + send_sig(SIGKILL, tsk, 1); + + spin_lock_irq(&tsk->sighand->siglock); + sigfillset(&tsk->blocked); + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(tsk, TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_FREEZE); + if (tsk->flags & PF_FROZEN) + tsk->flags &= ~PF_FROZEN; + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + } + + put_task_struct(tsk); + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + rst_complete_pagein(ctx, 1); +#endif + + rst_finish_ubc(ctx); + 
cpt_object_destroy(ctx); + + return err; +} + +static int rst_utsname(cpt_context_t *ctx) /* Restore the executing VE's UTS nodename and domainname from the CPT_SECT_UTSNAME section; returns 0 (also when the section is absent) or a negative errno. */ +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_UTSNAME]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr o; + struct ve_struct *ve; + struct uts_namespace *ns; + int i; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + ve = get_exec_env(); + ns = ve->ve_ns->uts_ns; + + i = 0; + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int len; + char *ptr; + err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx); + if (err) + return err; + len = o.cpt_next - o.cpt_hdrlen; + if (len < 0 || len > __NEW_UTS_LEN + 1) /* len comes from image-supplied header fields; a negative value would slip past the upper bound and reach pread() as its length */ + return len < 0 ? -EINVAL : -ENAMETOOLONG; + switch (i) { /* first name object is the nodename, second the domainname; any further object is rejected */ + case 0: + ptr = ns->name.nodename; break; + case 1: + ptr = ns->name.domainname; break; + default: + return -EINVAL; + } + err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen); /* NOTE(review): nothing here NUL-terminates the name; presumably the image stores the terminator - confirm */ + if (err) + return err; + i++; + sec += o.cpt_next; + } + + return 0; +} diff -urNp linux-2.6.32.48/kernel/cpu.c linux-2.6.32.48-openvz/kernel/cpu.c --- linux-2.6.32.48/kernel/cpu.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cpu.c 2011-11-21 17:40:47.000000000 -0500 @@ -150,7 +150,7 @@ static inline void check_for_tasks(int c struct task_struct *p; write_lock_irq(&tasklist_lock); - for_each_process(p) { + for_each_process_all(p) { if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) diff -urNp linux-2.6.32.48/kernel/cred.c linux-2.6.32.48-openvz/kernel/cred.c --- linux-2.6.32.48/kernel/cred.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/cred.c 2011-11-21 17:40:47.000000000 -0500 @@ -62,6 +62,7 @@ struct cred init_cred = { .tgcred = &init_tgcred, #endif }; +EXPORT_SYMBOL_GPL(init_cred); static inline void set_cred_subscribers(struct cred *cred, int n) { 
diff -urNp linux-2.6.32.48/kernel/exit.c linux-2.6.32.48-openvz/kernel/exit.c --- linux-2.6.32.48/kernel/exit.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/exit.c 2011-11-21 17:40:47.000000000 -0500 @@ -22,6 +22,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -50,12 +53,15 @@ #include #include +#include +#include + #include #include #include #include -static void exit_mm(struct task_struct * tsk); +void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) { @@ -66,6 +72,9 @@ static void __unhash_process(struct task detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); +#ifdef CONFIG_VE + list_del_rcu(&p->ve_task_info.vetask_list); +#endif __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); @@ -184,6 +193,8 @@ repeat: write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); + nr_zombie--; + atomic_inc(&nr_dead); /* * If we are the last non-leader member of the thread @@ -212,9 +223,12 @@ repeat: if (zap_leader) leader->exit_state = EXIT_DEAD; } + put_task_fairsched_node(p); write_unlock_irq(&tasklist_lock); release_thread(p); + ub_task_uncharge(p); + pput_ve(p->ve_task_info.owner_env); call_rcu(&p->rcu, delayed_put_task_struct); p = leader; @@ -429,6 +443,8 @@ void daemonize(const char *name, ...) va_list args; sigset_t blocked; + (void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL); + va_start(args, name); vsnprintf(current->comm, sizeof(current->comm), name, args); va_end(args); @@ -533,6 +549,7 @@ void put_files_struct(struct files_struc free_fdtable(fdt); } } +EXPORT_SYMBOL_GPL(put_files_struct); void reset_files_struct(struct files_struct *files) { @@ -605,10 +622,10 @@ retry: * Search through everything else. 
We should not get * here often */ - do_each_thread(g, c) { + do_each_thread_all(g, c) { if (c->mm == mm) goto assign_new_owner; - } while_each_thread(g, c); + } while_each_thread_all(g, c); read_unlock(&tasklist_lock); /* @@ -647,7 +664,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -655,6 +672,10 @@ static void exit_mm(struct task_struct * mm_release(tsk, mm); if (!mm) return; + + if (test_tsk_thread_flag(tsk, TIF_MEMDIE)) + mm->oom_killed = 1; + /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state @@ -699,6 +720,7 @@ static void exit_mm(struct task_struct * mm_update_next_owner(mm); mmput(mm); } +EXPORT_SYMBOL_GPL(exit_mm); /* * When we die, we re-parent all our children. @@ -713,7 +735,7 @@ static struct task_struct *find_new_reap struct task_struct *thread; thread = father; - while_each_thread(father, thread) { + while_each_thread_ve(father, thread) { if (thread->flags & PF_EXITING) continue; if (unlikely(pid_ns->child_reaper == father)) @@ -846,11 +868,16 @@ static void exit_notify(struct task_stru tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; + if (tsk->exit_signal != -1 && tsk == init_pid_ns.child_reaper) + /* We don't want people slaying init. */ + tsk->exit_signal = SIGCHLD; + signal = tracehook_notify_death(tsk, &cookie, group_dead); if (signal >= 0) signal = do_notify_parent(tsk, signal); tsk->exit_state = signal == DEATH_REAP ? 
EXIT_DEAD : EXIT_ZOMBIE; + nr_zombie++; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -916,6 +943,7 @@ NORET_TYPE void do_exit(long code) set_fs(USER_DS); tracehook_report_exit(&code); + (void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL); validate_creds_for_do_exit(tsk); @@ -999,7 +1027,15 @@ NORET_TYPE void do_exit(long code) */ perf_event_exit_task(tsk); - exit_notify(tsk, group_dead); + if (!(tsk->flags & PF_EXIT_RESTART)) + exit_notify(tsk, group_dead); + else { + write_lock_irq(&tasklist_lock); + tsk->exit_state = EXIT_ZOMBIE; + nr_zombie++; + write_unlock_irq(&tasklist_lock); + exit_task_namespaces(tsk); + } #ifdef CONFIG_NUMA mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; @@ -1645,7 +1681,7 @@ repeat: if (wo->wo_flags & __WNOTHREAD) break; - } while_each_thread(current, tsk); + } while_each_thread_ve(current, tsk); read_unlock(&tasklist_lock); notask: @@ -1772,6 +1808,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } +EXPORT_SYMBOL_GPL(sys_wait4); #ifdef __ARCH_WANT_SYS_WAITPID diff -urNp linux-2.6.32.48/kernel/fairsched.c linux-2.6.32.48-openvz/kernel/fairsched.c --- linux-2.6.32.48/kernel/fairsched.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/fairsched.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,683 @@ +/* + * Fair Scheduler + * + * Copyright (C) 2000-2008 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include + +struct fairsched_node fairsched_init_node = { + .id = FAIRSCHED_INIT_NODE_ID, + .tg = &init_task_group, +#ifdef CONFIG_VE + .owner_env = get_ve0(), +#endif + .weight = 1, +}; + +static DEFINE_MUTEX(fairsched_mutex); + +/* list protected with fairsched_mutex */ +static LIST_HEAD(fairsched_node_head); +static int fairsched_nr_nodes; + +void __init fairsched_init_early(void) +{ + list_add(&fairsched_init_node.nodelist, &fairsched_node_head); + fairsched_nr_nodes++; +} + +#define FSCHWEIGHT_BASE 512000 + +/****************************************************************************** + * cfs group shares = FSCHWEIGHT_BASE / fairsched weight + * + * vzctl cpuunits default 1000 + * cfs shares default value is 1024 (see init_task_group_load in sched.c) + * cpuunits = 1000 --> weight = 500000 / cpuunits = 500 --> shares = 1024 + * ^--- from vzctl + * weight in 1..65535 --> shares in 7..512000 + * shares should be >1 (see comment in sched_group_set_shares function) + *****************************************************************************/ + +static struct fairsched_node *fairsched_find(unsigned int id) +{ + struct fairsched_node *p; + list_for_each_entry(p, &fairsched_node_head, nodelist) { + if (p->id == id) + return p; + } + return NULL; +} + +/****************************************************************************** + * System calls + * + * All do_xxx functions are called under fairsched mutex and after + * capability check. + * + * The binary interfaces follow some other Fair Scheduler implementations + * (although some system call arguments are not needed for our implementation). 
+ *****************************************************************************/ + +static int do_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid) /* Create fairsched node newid with the given weight, backed by a new task group under init_task_group; parent is not used. Returns newid, or -EINVAL (weight/id out of range), -EBUSY (id already present), -ENOMEM (allocation or group creation failed). */ +{ + struct fairsched_node *node; + int retval; + + retval = -EINVAL; + if (weight < 1 || weight > FSCHWEIGHT_MAX) + goto out; + if (newid > INT_MAX) /* newid is unsigned, so a "< 0" test can never fire; a negative id from an int caller wraps above INT_MAX and is rejected here */ + goto out; + + retval = -EBUSY; + if (fairsched_find(newid) != NULL) + goto out; + + retval = -ENOMEM; + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (node == NULL) + goto out; + + node->tg = sched_create_group(&init_task_group); + if (IS_ERR(node->tg)) + goto out_free; + + node->id = newid; + node->weight = weight; + sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight); +#ifdef CONFIG_VE + node->owner_env = get_exec_env(); +#endif + list_add(&node->nodelist, &fairsched_node_head); + fairsched_nr_nodes++; + + retval = newid; +out: + return retval; + +out_free: + kfree(node); + return retval; +} + +asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid) +{ + int retval; + + if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_mknod(parent, weight, newid); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_mknod); + +static int do_fairsched_rmnod(unsigned int id) +{ + struct fairsched_node *node; + int retval; + + retval = -EINVAL; + node = fairsched_find(id); + if (node == NULL) + goto out; + if (node == &fairsched_init_node) + goto out; + + retval = -EBUSY; + if (node->refcnt) + goto out; + + list_del(&node->nodelist); + fairsched_nr_nodes--; + + sched_destroy_group(node->tg); + kfree(node); + retval = 0; +out: + return retval; +} + +asmlinkage int sys_fairsched_rmnod(unsigned int id) +{ + int retval; + + if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_rmnod(id); + mutex_unlock(&fairsched_mutex); + + return retval; +} 
+EXPORT_SYMBOL(sys_fairsched_rmnod); + +static int do_fairsched_chwt(unsigned int id, unsigned weight) +{ + struct fairsched_node *node; + + if (id == 0) + return -EINVAL; + if (weight < 1 || weight > FSCHWEIGHT_MAX) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + node->weight = weight; + sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight); + + return 0; +} + +asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned weight) +{ + int retval; + + if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_chwt(id, weight); + mutex_unlock(&fairsched_mutex); + + return retval; +} + +static int do_fairsched_vcpus(unsigned int id, unsigned int vcpus) +{ + struct fairsched_node *node; + + if (id == 0) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + return 0; +} + +asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus) +{ + int retval; + + if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_vcpus(id, vcpus); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_vcpus); + +static int do_fairsched_rate(unsigned int id, int op, unsigned rate) +{ + struct fairsched_node *node; + int retval; + + if (id == 0) + return -EINVAL; + if (op == FAIRSCHED_SET_RATE && (rate < 1 || rate >= (1UL << 31))) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + retval = -EINVAL; + switch (op) { + case FAIRSCHED_SET_RATE: + node->rate = rate; + node->rate_limited = 1; + retval = rate; + break; + case FAIRSCHED_DROP_RATE: + node->rate = 0; + node->rate_limited = 0; + retval = 0; + break; + case FAIRSCHED_GET_RATE: + if (node->rate_limited) + retval = node->rate; + else + retval = -ENODATA; + break; + } + return retval; +} + +asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate) +{ + int retval; + + if 
(!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_rate(id, op, rate); + mutex_unlock(&fairsched_mutex); + + return retval; +} + +static int do_fairsched_mvpr(pid_t pid, unsigned int nodeid) +{ + struct task_struct *p; + struct fairsched_node *node; + int retval; + + retval = -ENOENT; + node = fairsched_find(nodeid); + if (node == NULL) + goto out; + + write_lock_irq(&tasklist_lock); + retval = -ESRCH; + p = find_task_by_vpid(pid); + if (p == NULL) + goto out_unlock; + + get_task_struct(p); + put_task_fairsched_node(p); + p->fsched_node = node; + get_task_fairsched_node(p); + write_unlock_irq(&tasklist_lock); + + smp_wmb(); + sched_move_task(p); + put_task_struct(p); + return 0; + +out_unlock: + write_unlock_irq(&tasklist_lock); +out: + return retval; +} + +asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid) +{ + int retval; + + if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_mvpr(pid, nodeid); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_mvpr); + +int fairsched_new_node(int id, unsigned int vcpus) +{ + int err; + + mutex_lock(&fairsched_mutex); + /* + * We refuse to switch to an already existing node since nodes + * keep a pointer to their ve_struct... 
+ */ + err = do_fairsched_mknod(0, 1, id); + if (err < 0) { + printk(KERN_WARNING "Can't create fairsched node %d\n", id); + goto out; + } +#if 0 + err = do_fairsched_vcpus(id, vcpus); + if (err) { + printk(KERN_WARNING "Can't set sched vcpus on node %d\n", id); + goto cleanup; + } +#endif + err = do_fairsched_mvpr(current->pid, id); + if (err) { + printk(KERN_WARNING "Can't switch to fairsched node %d\n", id); + goto cleanup; + } + mutex_unlock(&fairsched_mutex); + return 0; + +cleanup: + if (do_fairsched_rmnod(id)) + printk(KERN_ERR "Can't clean fairsched node %d\n", id); +out: + mutex_unlock(&fairsched_mutex); + return err; +} +EXPORT_SYMBOL(fairsched_new_node); + +void fairsched_drop_node(int id) +{ + mutex_lock(&fairsched_mutex); + if (task_fairsched_node_id(current) == id) + if (do_fairsched_mvpr(current->pid, FAIRSCHED_INIT_NODE_ID)) + printk(KERN_WARNING "Can't leave sched node %d\n", id); + if (do_fairsched_rmnod(id)) + printk(KERN_ERR "Can't remove fairsched node %d\n", id); + mutex_unlock(&fairsched_mutex); +} +EXPORT_SYMBOL(fairsched_drop_node); + +#ifdef CONFIG_PROC_FS + +/*********************************************************************/ +/* + * proc interface + */ +/*********************************************************************/ + +#include +#include +#include + +struct fairsched_node_dump { + int id; + unsigned weight; + unsigned rate; + int rate_limited; + int nr_pcpu; + int nr_tasks, nr_runtasks; +}; + +struct fairsched_dump { + int len; + struct fairsched_node_dump nodes[0]; +}; + +static struct fairsched_dump *fairsched_do_dump(int compat) +{ + int nr_nodes; + int len; + struct fairsched_dump *dump; + struct fairsched_node *node; + struct fairsched_node_dump *p; + + mutex_lock(&fairsched_mutex); + nr_nodes = (ve_is_super(get_exec_env()) ? 
fairsched_nr_nodes + 16 : 1); + len = sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]); + dump = ub_vmalloc(len); + if (dump == NULL) + goto out; + + p = dump->nodes; + list_for_each_entry_reverse(node, &fairsched_node_head, nodelist) { + if ((char *)p - (char *)dump >= len) + break; + p->nr_tasks = 0; + p->nr_runtasks = 0; +#ifdef CONFIG_VE + if (!ve_accessible(node->owner_env, get_exec_env())) + continue; + p->nr_tasks = atomic_read(&node->owner_env->pcounter); + p->nr_runtasks = nr_running_ve(node->owner_env); +#endif + p->id = node->id; + p->weight = node->weight; + p->rate = node->rate; + p->rate_limited = node->rate_limited; + p->nr_pcpu = num_online_cpus(); + p++; + } + dump->len = p - dump->nodes; +out: + mutex_unlock(&fairsched_mutex); + return dump; +} + +#define FAIRSCHED_PROC_HEADLINES 2 + +#define FAIRSHED_DEBUG " debug" + +#ifdef CONFIG_VE +/* + * File format is dictated by compatibility reasons. + */ +static int fairsched_seq_show(struct seq_file *m, void *v) +{ + struct fairsched_dump *dump; + struct fairsched_node_dump *p; + unsigned vid, nid, pid, r; + + dump = m->private; + p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL); + if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { + if (p == dump->nodes) + seq_printf(m, "Version: 2.6 debug\n"); + else if (p == dump->nodes + 1) + seq_printf(m, + " veid " + " id " + " parent " + "weight " + " rate " + "tasks " + " run " + "cpus" + " " + "flg " + "ready " + " start_tag " + " value " + " delay" + "\n"); + } else { + p -= FAIRSCHED_PROC_HEADLINES; + vid = nid = pid = 0; + r = (unsigned long)v & 3; + if (p == dump->nodes) { + if (r == 2) + nid = p->id; + } else { + if (!r) + nid = p->id; + else if (r == 1) + vid = pid = p->id; + else + vid = p->id, nid = 1; + } + seq_printf(m, + "%10u " + "%10u %10u %6u %5u %5u %5u %4u" + " " + " %c%c %5u %20Lu %20Lu %20Lu" + "\n", + vid, + nid, + pid, + p->weight, + p->rate, + p->nr_tasks, + p->nr_runtasks, + p->nr_pcpu, + p->rate_limited ? 
'L' : '.', + '.', + p->nr_runtasks, + 0ll, 0ll, 0ll); + } + + return 0; +} + +static void *fairsched_seq_start(struct seq_file *m, loff_t *pos) +{ + struct fairsched_dump *dump; + unsigned long l; + + dump = m->private; + if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES) + return NULL; + if (*pos < FAIRSCHED_PROC_HEADLINES) + return dump->nodes + *pos; + /* guess why... */ + l = (unsigned long)(dump->nodes + + ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3); + l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3; + return (void *)l; +} +static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return fairsched_seq_start(m, pos); +} +#endif /* CONFIG_VE */ + +static int fairsched2_seq_show(struct seq_file *m, void *v) +{ + struct fairsched_dump *dump; + struct fairsched_node_dump *p; + + dump = m->private; + p = v; + if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { + if (p == dump->nodes) + seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n"); + else if (p == dump->nodes + 1) + seq_printf(m, + " id " + "weight " + " rate " + " run " + "cpus" +#ifdef FAIRSHED_DEBUG + " " + "flg " + "ready " + " start_tag " + " value " + " delay" +#endif + "\n"); + } else { + p -= FAIRSCHED_PROC_HEADLINES; + seq_printf(m, + "%10u %6u %5u %5u %4u" +#ifdef FAIRSHED_DEBUG + " " + " %c%c %5u %20Lu %20Lu %20Lu" +#endif + "\n", + p->id, + p->weight, + p->rate, + p->nr_runtasks, + p->nr_pcpu +#ifdef FAIRSHED_DEBUG + , + p->rate_limited ? 
'L' : '.', + '.', + p->nr_runtasks, + 0ll, 0ll, 0ll +#endif + ); + } + + return 0; +} + +static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos) +{ + struct fairsched_dump *dump; + + dump = m->private; + if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES) + return NULL; + return dump->nodes + *pos; +} +static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return fairsched2_seq_start(m, pos); +} +static void fairsched2_seq_stop(struct seq_file *m, void *v) +{ +} + +#ifdef CONFIG_VE +static struct seq_operations fairsched_seq_op = { + .start = fairsched_seq_start, + .next = fairsched_seq_next, + .stop = fairsched2_seq_stop, + .show = fairsched_seq_show +}; +#endif +static struct seq_operations fairsched2_seq_op = { + .start = fairsched2_seq_start, + .next = fairsched2_seq_next, + .stop = fairsched2_seq_stop, + .show = fairsched2_seq_show +}; +static int fairsched_seq_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + int compat; + +#ifdef CONFIG_VE + compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1); + ret = seq_open(file, compat ? 
&fairsched_seq_op : &fairsched2_seq_op); +#else + compat = 0; + ret = seq_open(file, &fairsched2_seq_op); +#endif + if (ret) + return ret; + m = file->private_data; + m->private = fairsched_do_dump(compat); + if (m->private == NULL) { + seq_release(inode, file); + ret = -ENOMEM; + } + return ret; +} +static int fairsched_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m; + struct fairsched_dump *dump; + + m = file->private_data; + dump = m->private; + m->private = NULL; + vfree(dump); + seq_release(inode, file); + return 0; +} +static struct file_operations proc_fairsched_operations = { + .open = fairsched_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = fairsched_seq_release +}; + +void __init fairsched_init_late(void) +{ + proc_create("fairsched", S_IRUGO, &glob_proc_root, + &proc_fairsched_operations); + proc_create("fairsched2", S_IRUGO, &glob_proc_root, + &proc_fairsched_operations); +} + +#else + +void __init fairsched_init_late(void) { } + +#endif /* CONFIG_PROC_FS */ diff -urNp linux-2.6.32.48/kernel/fork.c linux-2.6.32.48-openvz/kernel/fork.c --- linux-2.6.32.48/kernel/fork.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/fork.c 2011-11-21 17:40:47.000000000 -0500 @@ -64,6 +64,8 @@ #include #include #include +#include +#include #include #include @@ -72,6 +74,10 @@ #include #include +#include +#include +#include + #include /* @@ -79,12 +85,14 @@ */ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. 
*/ +EXPORT_SYMBOL_GPL(nr_threads); int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +EXPORT_SYMBOL(tasklist_lock); int nr_processes(void) { @@ -162,12 +170,18 @@ void __put_task_struct(struct task_struc WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + ub_task_put(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); +#ifdef CONFIG_VE + put_ve(VE_TASK_INFO(tsk)->owner_env); + atomic_dec(&nr_dead); +#endif if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); /* * macro override instead of weak attribute alias, to workaround @@ -186,7 +200,7 @@ void __init fork_init(unsigned long memp /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK | SLAB_UBC, NULL); #endif /* do the arch specific task caches init */ @@ -317,6 +331,10 @@ static int dup_mmap(struct mm_struct *mm continue; } charge = 0; + if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start, + mpnt->vm_flags & ~VM_LOCKED, + mpnt->vm_file, UB_HARD)) + goto fail_noch; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory(len)) @@ -376,7 +394,7 @@ static int dup_mmap(struct mm_struct *mm rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -395,6 +413,9 @@ out: fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: + ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start, + mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file); +fail_noch: retval = -ENOMEM; vm_unacct_memory(charge); goto out; @@ -462,6 +483,15 @@ static struct mm_struct * 
mm_init(struct mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); + /* + * This looks ugly, but when we came from + * sys_execve -> mm_alloc -> here + * we need to get exec_ub, not task_ub. But when + * we're here like this + * sys_fork() -> dup_mm -> here + * we need task_ub, not the exec one... xemul + */ + set_mm_ub(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -469,6 +499,7 @@ static struct mm_struct * mm_init(struct return mm; } + put_mm_ub(mm); free_mm(mm); return NULL; } @@ -487,6 +518,7 @@ struct mm_struct * mm_alloc(void) } return mm; } +EXPORT_SYMBOL_GPL(mm_alloc); /* * Called when the last reference to the mm @@ -499,6 +531,7 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); + put_mm_ub(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -523,6 +556,9 @@ void mmput(struct mm_struct *mm) put_swap_token(mm); if (mm->binfmt) module_put(mm->binfmt->module); + (void) virtinfo_gencall(VIRTINFO_EXITMMAP, mm); + if (mm->oom_killed) + ub_oom_task_dead(current); mmdrop(mm); } } @@ -573,18 +609,20 @@ void mm_release(struct task_struct *tsk, /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) { - exit_robust_list(tsk); - tsk->robust_list = NULL; - } + if (!(tsk->flags & PF_EXIT_RESTART)) { + if (unlikely(tsk->robust_list)) { + exit_robust_list(tsk); + tsk->robust_list = NULL; + } #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) { - compat_exit_robust_list(tsk); - tsk->compat_robust_list = NULL; - } + if (unlikely(tsk->compat_robust_list)) { + compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } #endif - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); + } #endif /* Get rid of any cached register state */ @@ -673,6 +711,7 @@ fail_nocontext: * because it calls destroy_context() */ mm_free_pgd(mm); + 
put_mm_ub(mm); free_mm(mm); return NULL; } @@ -981,6 +1020,7 @@ static struct task_struct *copy_process( unsigned long stack_size, int __user *child_tidptr, struct pid *pid, + pid_t vpid, int trace) { int retval; @@ -1028,6 +1068,9 @@ static struct task_struct *copy_process( rt_mutex_init_task(p); + if (ub_task_charge(current, p)) + goto bad_fork_charge; + #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); @@ -1151,7 +1194,7 @@ static struct task_struct *copy_process( goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; - if ((retval = copy_namespaces(clone_flags, p))) + if ((retval = copy_namespaces(clone_flags, p, 0))) goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; @@ -1161,7 +1204,7 @@ static struct task_struct *copy_process( if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(p->nsproxy->pid_ns); + pid = alloc_pid(p->nsproxy->pid_ns, vpid); if (!pid) goto bad_fork_cleanup_io; @@ -1169,6 +1212,8 @@ static struct task_struct *copy_process( retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); if (retval < 0) goto bad_fork_free_pid; + if (task_active_pid_ns(current)->flags & PID_NS_HIDE_CHILD) + task_active_pid_ns(p)->flags |= PID_NS_HIDDEN; } } @@ -1253,7 +1298,7 @@ static struct task_struct *copy_process( * thread can't slip out of an OOM kill (or normal SIGKILL). 
*/ recalc_sigpending(); - if (signal_pending(current)) { + if (signal_pending(current) && !vpid) { spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; @@ -1281,14 +1326,24 @@ static struct task_struct *copy_process( attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); +#ifdef CONFIG_VE + list_add_tail_rcu(&p->ve_task_info.vetask_list, + &p->ve_task_info.owner_env->vetask_lh); +#endif __get_cpu_var(process_counts)++; } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } + (void)get_ve(p->ve_task_info.owner_env); + pget_ve(p->ve_task_info.owner_env); +#ifdef CONFIG_VE + seqcount_init(&p->ve_task_info.wakeup_lock); +#endif total_forks++; spin_unlock(&current->sighand->siglock); + get_task_fairsched_node(p); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); @@ -1331,6 +1386,9 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + ub_task_uncharge(p); + ub_task_put(p); +bad_fork_charge: free_task(p); fork_out: return ERR_PTR(retval); @@ -1348,7 +1406,7 @@ struct task_struct * __cpuinit fork_idle struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, - &init_struct_pid, 0); + &init_struct_pid, 0, 0); if (!IS_ERR(task)) init_idle(task, cpu); @@ -1361,12 +1419,13 @@ struct task_struct * __cpuinit fork_idle * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. 
*/ -long do_fork(unsigned long clone_flags, +long do_fork_pid(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr) + int __user *child_tidptr, + long vpid) { struct task_struct *p; int trace = 0; @@ -1404,6 +1463,10 @@ long do_fork(unsigned long clone_flags, } } + nr = virtinfo_gencall(VIRTINFO_DOFORK, (void *)clone_flags); + if (nr) + return nr; + /* * When called from kernel_thread, don't do user tracing stuff. */ @@ -1411,7 +1474,7 @@ long do_fork(unsigned long clone_flags, trace = tracehook_prepare_clone(clone_flags); p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL, trace); + child_tidptr, NULL, vpid, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1442,6 +1505,8 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; + (void)virtinfo_gencall(VIRTINFO_DOFORKRET, p); + if (unlikely(clone_flags & CLONE_STOPPED)) { /* * We'll start up with an immediate SIGSTOP. 
@@ -1465,6 +1530,8 @@ long do_fork(unsigned long clone_flags, } else { nr = PTR_ERR(p); } + + (void)virtinfo_gencall(VIRTINFO_DOFORKPOST, (void *)(long)nr); return nr; } @@ -1480,25 +1547,38 @@ static void sighand_ctor(void *data) init_waitqueue_head(&sighand->signalfd_wqh); } +EXPORT_SYMBOL(do_fork_pid); + +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + return do_fork_pid(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr, 0); +} + void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_UBC, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_UBC); mmap_init(); } diff -urNp linux-2.6.32.48/kernel/freezer.c linux-2.6.32.48-openvz/kernel/freezer.c --- linux-2.6.32.48/kernel/freezer.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/freezer.c 2011-11-21 
17:40:47.000000000 -0500 @@ -29,6 +29,28 @@ void refrigerator(void) processes around? */ long save; +#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE) + save = current->state; + current->state = TASK_UNINTERRUPTIBLE; + + spin_lock_irq(&current->sighand->siglock); + if (test_and_clear_thread_flag(TIF_FREEZE)) { + recalc_sigpending(); /* We sent fake signal, clean it up */ + if (atomic_read(&global_suspend) || + atomic_read(&get_exec_env()->suspend)) + current->flags |= PF_FROZEN; + else + current->state = save; + } else { + /* Freeze request could be canceled before we entered + * refrigerator(). In this case we do nothing. */ + current->state = save; + } + spin_unlock_irq(&current->sighand->siglock); + + while (current->flags & PF_FROZEN) + schedule(); +#else task_lock(current); if (freezing(current)) { frozen_process(); @@ -57,6 +79,7 @@ void refrigerator(void) /* Remove the accounting blocker */ current->flags &= ~PF_FREEZING; +#endif pr_debug("%s left refrigerator\n", current->comm); __set_current_state(save); } diff -urNp linux-2.6.32.48/kernel/futex.c linux-2.6.32.48-openvz/kernel/futex.c --- linux-2.6.32.48/kernel/futex.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/futex.c 2011-11-21 17:40:47.000000000 -0500 @@ -1618,8 +1618,6 @@ handle_fault: #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 -static long futex_wait_restart(struct restart_block *restart); - /** * fixup_owner() - Post lock pi_state and corner case management * @uaddr: user address of the futex @@ -1893,7 +1891,7 @@ out: } -static long futex_wait_restart(struct restart_block *restart) +long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; int fshared = 0; @@ -1910,6 +1908,7 @@ static long futex_wait_restart(struct re restart->futex.bitset, restart->futex.flags & FLAGS_CLOCKRT); } +EXPORT_SYMBOL_GPL(futex_wait_restart); /* diff -urNp linux-2.6.32.48/kernel/hrtimer.c 
linux-2.6.32.48-openvz/kernel/hrtimer.c --- linux-2.6.32.48/kernel/hrtimer.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/hrtimer.c 2011-11-21 17:40:47.000000000 -0500 @@ -1554,6 +1554,7 @@ out: destroy_hrtimer_on_stack(&t.timer); return ret; } +EXPORT_SYMBOL_GPL(hrtimer_nanosleep_restart); long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) diff -urNp linux-2.6.32.48/kernel/hung_task.c linux-2.6.32.48-openvz/kernel/hung_task.c --- linux-2.6.32.48/kernel/hung_task.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/hung_task.c 2011-11-21 17:40:47.000000000 -0500 @@ -143,7 +143,7 @@ static void check_hung_uninterruptible_t return; rcu_read_lock(); - do_each_thread(g, t) { + do_each_thread_all(g, t) { if (!--max_count) goto unlock; if (!--batch_count) { @@ -156,7 +156,7 @@ static void check_hung_uninterruptible_t /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); - } while_each_thread(g, t); + } while_each_thread_all(g, t); unlock: rcu_read_unlock(); } diff -urNp linux-2.6.32.48/kernel/Kconfig.openvz linux-2.6.32.48-openvz/kernel/Kconfig.openvz --- linux-2.6.32.48/kernel/Kconfig.openvz 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/Kconfig.openvz 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,103 @@ +# Copyright (C) 2005 SWsoft +# All rights reserved. +# Licensing governed by "linux/COPYING.SWsoft" file. 
+ +menu "OpenVZ" + +config VE + bool "Virtual Environment support" + default y + select NAMESPACES + select PID_NS + select IPC_NS + select UTS_NS + select NET_NS + select USER_NS + select CGROUPS + select CGROUP_DEVICE + select GROUP_SCHED + select FAIR_GROUP_SCHED + help + This option adds support for virtual Linux running on the original box + with fully supported virtual network driver, tty subsystem and + configurable access for hardware and other resources. + +config VE_CALLS + tristate "VE calls interface" + depends on VE + select VZ_DEV + default m + help + This option controls how to build vzmon code containing VE calls. + By default it's built as the module vzmon.o + +config VZ_GENCALLS + bool + default y + +config VE_NETDEV + tristate "VE network device" + depends on VE_CALLS && NET + select VZ_DEV + default m + help + This option controls whether to build venet device. This is a + common interface for networking in VE. + +config VE_ETHDEV + tristate "Virtual ethernet device" + depends on VE_CALLS && NET + select VZ_DEV + default m + help + This option controls whether to build virtual ethernet device. + +config VZ_DEV + tristate "VE device" + default m + help + This option adds support for the vzdev device, which is used by + user-space applications to control Virtual Environments. + +config VE_IPTABLES + bool "VE netfiltering" + depends on VE && VE_NETDEV && INET && NETFILTER + default y + help + This option controls whether to build VE netfiltering code. + +config VZ_WDOG + tristate "VE watchdog module" + depends on VE_CALLS + default m + help + This option controls building of vzwdog module, which dumps + a lot of useful system info on console periodically. 
+ +config VZ_CHECKPOINT + tristate "Checkpointing & restoring Virtual Environments" + depends on X86 || IA64 + depends on VE_CALLS + select PM + select PM_SLEEP + select TUN + select VE_ETHDEV + select VE_NETDEV + default m + help + This option adds two modules, "cpt" and "rst", which allow + saving a running Virtual Environment and restoring it + on another host (live migration) or on the same host (checkpointing). + +config VZ_EVENT + tristate "Enable sending notifications of the VE status change through the netlink socket" + depends on VE && VE_CALLS && NET + default m + help + This option provides for sending notifications of the VE + events to the curious user space applications through + the netlink socket just like the core kernel + networking code does. For now, only notifications of + essential VE status changes are sent. + +endmenu diff -urNp linux-2.6.32.48/kernel/kgdb.c linux-2.6.32.48-openvz/kernel/kgdb.c --- linux-2.6.32.48/kernel/kgdb.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/kgdb.c 2011-11-21 17:40:47.000000000 -0500 @@ -1019,7 +1019,7 @@ static void gdb_cmd_query(struct kgdb_st } } - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (i >= ks->thr_query && !finished) { int_to_threadref(thref, p->pid); pack_threadid(ptr, thref); @@ -1030,7 +1030,7 @@ static void gdb_cmd_query(struct kgdb_st finished = 1; } i++; - } while_each_thread(g, p); + } while_each_thread_all(g, p); *(--ptr) = '\0'; break; diff -urNp linux-2.6.32.48/kernel/kmod.c linux-2.6.32.48-openvz/kernel/kmod.c --- linux-2.6.32.48/kernel/kmod.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/kmod.c 2011-11-21 17:40:47.000000000 -0500 @@ -80,6 +80,10 @@ int __request_module(bool wait, const ch #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + /* Don't allow request_module() inside VE. 
*/ + if (!ve_is_super(get_exec_env())) + return -EPERM; + ret = security_kernel_module_request(); if (ret) return ret; @@ -471,6 +475,9 @@ int call_usermodehelper_exec(struct subp DECLARE_COMPLETION_ONSTACK(done); int retval = 0; + if (!ve_is_super(get_exec_env())) + return -EPERM; + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); validate_creds(sub_info->cred); diff -urNp linux-2.6.32.48/kernel/kprobes.c linux-2.6.32.48-openvz/kernel/kprobes.c --- linux-2.6.32.48/kernel/kprobes.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/kprobes.c 2011-11-21 17:40:47.000000000 -0500 @@ -128,14 +128,14 @@ static int __kprobes check_safety(void) ret = freeze_processes(); if (ret == 0) { struct task_struct *p, *q; - do_each_thread(p, q) { + do_each_thread_all(p, q) { if (p != current && p->state == TASK_RUNNING && p->pid != 0) { printk("Check failed: %s is running\n",p->comm); ret = -1; goto loop_end; } - } while_each_thread(p, q); + } while_each_thread_all(p, q); } loop_end: thaw_processes(); diff -urNp linux-2.6.32.48/kernel/kthread.c linux-2.6.32.48-openvz/kernel/kthread.c --- linux-2.6.32.48/kernel/kthread.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/kthread.c 2011-11-21 17:40:47.000000000 -0500 @@ -14,6 +14,7 @@ #include #include #include +#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -25,6 +26,7 @@ struct kthread_create_info /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; + struct ve_struct *ve; /* Result passed back to kthread_create() from kthreadd. 
*/ struct task_struct *result; @@ -67,6 +69,16 @@ static int kthread(void *_create) init_completion(&self.exited); current->vfork_done = &self.exited; + if (do_ve_enter_hook && create->ve != get_ve0()) { + ret = do_ve_enter_hook(create->ve, 0); + if (ret < 0) { + create->result = ERR_PTR(ret); + complete(&create->done); + goto out; + } + } else if (create->ve != get_ve0()) + BUG(); + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; @@ -76,7 +88,7 @@ static int kthread(void *_create) ret = -EINTR; if (!self.should_stop) ret = threadfn(data); - +out: /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } @@ -94,7 +106,7 @@ static void create_kthread(struct kthrea } /** - * kthread_create - create a kthread. + * kthread_create_ve - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @namefmt: printf-style name for the thread. @@ -112,7 +124,8 @@ static void create_kthread(struct kthrea * * Returns a task_struct or ERR_PTR(-ENOMEM). */ -struct task_struct *kthread_create(int (*threadfn)(void *data), +struct task_struct *kthread_create_ve(struct ve_struct *ve, + int (*threadfn)(void *data), void *data, const char namefmt[], ...) @@ -121,6 +134,7 @@ struct task_struct *kthread_create(int ( create.threadfn = threadfn; create.data = data; + create.ve = ve; init_completion(&create.done); spin_lock(&kthread_create_lock); @@ -147,7 +161,7 @@ struct task_struct *kthread_create(int ( } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(kthread_create_ve); /** * kthread_stop - stop a thread created by kthread_create(). 
diff -urNp linux-2.6.32.48/kernel/lockdep.c linux-2.6.32.48-openvz/kernel/lockdep.c --- linux-2.6.32.48/kernel/lockdep.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/lockdep.c 2011-11-21 17:40:47.000000000 -0500 @@ -3742,7 +3742,7 @@ retry: printk(KERN_CONT " locked it.\n"); } - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * It's not reliable to print a task's held locks * if it's not sleeping (or if it's not the current @@ -3755,7 +3755,7 @@ retry: if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; - } while_each_thread(g, p); + } while_each_thread_all(g, p); printk("\n"); printk("=============================================\n\n"); diff -urNp linux-2.6.32.48/kernel/Makefile linux-2.6.32.48-openvz/kernel/Makefile --- linux-2.6.32.48/kernel/Makefile 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/Makefile 2011-11-21 17:40:47.000000000 -0500 @@ -28,6 +28,10 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +obj-$(CONFIG_BEANCOUNTERS) += bc/ +obj-y += ve/ +obj-$(CONFIG_VZ_CHECKPOINT) += cpt/ + obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) @@ -57,7 +61,12 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o +# Kconfig leaves a disabled option undefined (never "n"), so test for "y". +ifneq ($(CONFIG_VE),y) obj-$(CONFIG_CGROUPS) += cgroup.o +else +obj-$(CONFIG_CGROUPS) += cgroup_lite.o +endif obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o @@ -88,6 +97,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayac obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_VZ_FAIRSCHED) += fairsched.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ 
obj-$(CONFIG_X86_DS) += trace/ diff -urNp linux-2.6.32.48/kernel/module.c linux-2.6.32.48-openvz/kernel/module.c --- linux-2.6.32.48/kernel/module.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/module.c 2011-11-21 17:40:47.000000000 -0500 @@ -2917,6 +2917,8 @@ static char *module_flags(struct module static void *m_start(struct seq_file *m, loff_t *pos) { mutex_lock(&module_mutex); + if (!ve_is_super(get_exec_env())) + return NULL; return seq_list_start(&modules, *pos); } @@ -2981,7 +2983,7 @@ static const struct file_operations proc static int __init proc_modules_init(void) { - proc_create("modules", 0, NULL, &proc_modules_operations); + proc_create("modules", 0, &glob_proc_root, &proc_modules_operations); return 0; } module_init(proc_modules_init); diff -urNp linux-2.6.32.48/kernel/nsproxy.c linux-2.6.32.48-openvz/kernel/nsproxy.c --- linux-2.6.32.48/kernel/nsproxy.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/nsproxy.c 2011-11-21 17:40:47.000000000 -0500 @@ -26,6 +26,14 @@ static struct kmem_cache *nsproxy_cachep struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); +void get_task_namespaces(struct task_struct *tsk) +{ + struct nsproxy *ns = tsk->nsproxy; + if (ns) { + get_nsproxy(ns); + } +} + static inline struct nsproxy *create_nsproxy(void) { struct nsproxy *nsproxy; @@ -69,7 +77,7 @@ static struct nsproxy *create_new_namesp goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; @@ -104,7 +112,8 @@ out_ns: * called from clone. This now handles copy for nsproxy and all * namespaces therein. 
*/ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(unsigned long flags, struct task_struct *tsk, + int force_admin) { struct nsproxy *old_ns = tsk->nsproxy; struct nsproxy *new_ns; @@ -119,9 +128,20 @@ int copy_namespaces(unsigned long flags, CLONE_NEWPID | CLONE_NEWNET))) return 0; - if (!capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; + if (!force_admin) { + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + /* + * netns-vs-sysfs is deadly broken, thus new namespace + * (even in ve0) can bring the node down + */ + if (flags & CLONE_NEWNET) { + err = -EINVAL; + goto out; + } } /* @@ -148,6 +168,7 @@ out: put_nsproxy(old_ns); return err; } +EXPORT_SYMBOL(copy_namespaces); void free_nsproxy(struct nsproxy *ns) { @@ -162,6 +183,22 @@ void free_nsproxy(struct nsproxy *ns) put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } +EXPORT_SYMBOL(free_nsproxy); + +struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk) +{ + struct mnt_namespace *mnt_ns = NULL; + + task_lock(tsk); + if (tsk->nsproxy) + mnt_ns = tsk->nsproxy->mnt_ns; + if (mnt_ns) + get_mnt_ns(mnt_ns); + task_unlock(tsk); + + return mnt_ns; +} +EXPORT_SYMBOL(get_task_mnt_ns); /* * Called from unshare. Unshare all the namespaces part of nsproxy. @@ -179,6 +216,9 @@ int unshare_nsproxy_namespaces(unsigned if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (unshare_flags & CLONE_NEWNET) + return -EINVAL; + *new_nsp = create_new_namespaces(unshare_flags, current, new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { diff -urNp linux-2.6.32.48/kernel/pid.c linux-2.6.32.48-openvz/kernel/pid.c --- linux-2.6.32.48/kernel/pid.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/pid.c 2011-11-21 17:40:47.000000000 -0500 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -110,7 +111,7 @@ EXPORT_SYMBOL(is_container_init); * For now it is easier to be safe than to prove it can't happen. 
*/ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); static void free_pidmap(struct upid *upid) { @@ -121,8 +122,9 @@ static void free_pidmap(struct upid *upi clear_bit(offset, map->page); atomic_inc(&map->nr_free); } +EXPORT_SYMBOL_GPL(free_pidmap); -static int alloc_pidmap(struct pid_namespace *pid_ns) +int alloc_pidmap(struct pid_namespace *pid_ns) { int i, offset, max_scan, pid, last = pid_ns->last_pid; struct pidmap *map; @@ -182,6 +184,36 @@ static int alloc_pidmap(struct pid_names return -1; } +int set_pidmap(struct pid_namespace *pid_ns, pid_t pid) +{ + int offset; + struct pidmap *map; + + offset = pid & BITS_PER_PAGE_MASK; + map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; + if (unlikely(!map->page)) { + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); + /* + * Free the page if someone raced with us + * installing it: + */ + spin_lock_irq(&pidmap_lock); + if (map->page) + kfree(page); + else + map->page = page; + spin_unlock_irq(&pidmap_lock); + if (unlikely(!map->page)) + return -ENOMEM; + } + + if (test_and_set_bit(offset, map->page)) + return -EBUSY; + + atomic_dec(&map->nr_free); + return pid; +} + int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) { int offset; @@ -230,25 +262,34 @@ void free_pid(struct pid *pid) /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; unsigned long flags; + struct upid *upid; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); - spin_unlock_irqrestore(&pidmap_lock, flags); + for (i = 0; i <= pid->level; i++) { + upid = &pid->numbers[i]; + if (!hlist_unhashed(&upid->pid_chain)) + hlist_del_rcu(&upid->pid_chain); + } + spin_unlock(&pidmap_lock); + ub_kmemsize_uncharge(pid->ub, + kmem_cache_objuse(pid->numbers[pid->level].ns->pid_cachep)); + local_irq_restore(flags); for (i = 0; i <= pid->level; i++) free_pidmap(pid->numbers + i); - + put_beancounter(pid->ub); 
call_rcu(&pid->rcu, delayed_put_pid); } +EXPORT_SYMBOL_GPL(free_pid); -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid) { struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; + struct user_beancounter *ub; pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) @@ -256,7 +297,10 @@ struct pid *alloc_pid(struct pid_namespa tmp = ns; for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); + if (vpid != 0 && i == ns->level) + nr = set_pidmap(tmp, vpid); + else + nr = alloc_pidmap(tmp); if (nr < 0) goto out_free; @@ -271,17 +315,32 @@ struct pid *alloc_pid(struct pid_namespa for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); +#ifdef CONFIG_BEANCOUNTERS + ub = get_exec_ub(); + local_irq_disable(); + if (ub_kmemsize_charge(ub, kmem_cache_objuse(ns->pid_cachep), UB_HARD)) + goto out_enable; + pid->ub = get_beancounter(ub); + spin_lock(&pidmap_lock); +#else spin_lock_irq(&pidmap_lock); +#endif for (i = ns->level; i >= 0; i--) { upid = &pid->numbers[i]; hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + if (upid->ns->flags & PID_NS_HIDDEN) + while (i--) + INIT_HLIST_NODE(&pid->numbers[i].pid_chain); } spin_unlock_irq(&pidmap_lock); out: return pid; +out_enable: + local_irq_enable(); + put_pid_ns(ns); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); @@ -290,6 +349,7 @@ out_free: pid = NULL; goto out; } +EXPORT_SYMBOL_GPL(alloc_pid); struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { @@ -312,6 +372,45 @@ struct pid *find_vpid(int nr) } EXPORT_SYMBOL_GPL(find_vpid); +void reattach_pid(struct task_struct *tsk, enum pid_type type, + struct pid *pid) +{ + int i; + struct pid *old_pid; + struct pid_link *link; + struct upid *upid; + + link = &tsk->pids[type]; + old_pid = link->pid; + + hlist_del_rcu(&link->node); + link->pid = pid; + hlist_add_head_rcu(&link->node, &pid->tasks[type]); + 
+ if (type != PIDTYPE_PID) { + for (i = PIDTYPE_MAX; --i >= 0; ) + if (!hlist_empty(&old_pid->tasks[i])) + return; + + for (i = 0; i < pid->level; i++) + hlist_del_rcu(&old_pid->numbers[i].pid_chain); + } else { + for (i = PIDTYPE_MAX; --i >= 0; ) + if (!hlist_empty(&old_pid->tasks[i])) + BUG(); + + for (i = 0; i < pid->level; i++) + hlist_replace_rcu(&old_pid->numbers[i].pid_chain, + &pid->numbers[i].pid_chain); + + upid = &pid->numbers[pid->level]; + hlist_add_head_rcu(&upid->pid_chain, + &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + } + + call_rcu(&old_pid->rcu, delayed_put_pid); +} + /* * attach_pid() must be called with the tasklist_lock write-held. */ @@ -324,6 +423,7 @@ void attach_pid(struct task_struct *task link->pid = pid; hlist_add_head_rcu(&link->node, &pid->tasks[type]); } +EXPORT_SYMBOL_GPL(attach_pid); static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) @@ -344,6 +444,7 @@ static void __change_pid(struct task_str free_pid(pid); } +EXPORT_SYMBOL_GPL(detach_pid); void detach_pid(struct task_struct *task, enum pid_type type) { @@ -390,6 +491,7 @@ struct task_struct *find_task_by_vpid(pi { return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); } +EXPORT_SYMBOL(find_task_by_vpid); struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { @@ -425,6 +527,17 @@ struct pid *find_get_pid(pid_t nr) } EXPORT_SYMBOL_GPL(find_get_pid); +pid_t pid_to_vpid(pid_t nr) +{ + struct pid *pid; + + pid = find_pid_ns(nr, &init_pid_ns); + if (pid) + return pid->numbers[pid->level].nr; + return -1; +} +EXPORT_SYMBOL_GPL(pid_to_vpid); + pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) { struct upid *upid; diff -urNp linux-2.6.32.48/kernel/pid_namespace.c linux-2.6.32.48-openvz/kernel/pid_namespace.c --- linux-2.6.32.48/kernel/pid_namespace.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/pid_namespace.c 2011-11-21 17:40:47.000000000 -0500 @@ -13,6 +13,10 @@ #include #include #include 
+#include +#include + +#include #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -136,12 +140,167 @@ void free_pid_ns(struct kref *kref) put_pid_ns(parent); } +/* + * this is a dirty ugly hack. + */ + +static int __pid_ns_attach_task(struct pid_namespace *ns, + struct task_struct *tsk, pid_t nr) +{ + struct pid *pid; + enum pid_type type; + unsigned long old_size, new_size; + + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); + if (!pid) + goto out; + + if (nr == 0) + nr = alloc_pidmap(ns); + else + nr = set_pidmap(ns, nr); + + if (nr < 0) + goto out_free; + + memcpy(pid, task_pid(tsk), + sizeof(struct pid) + (ns->level - 1) * sizeof(struct upid)); + get_pid_ns(ns); + pid->level++; + BUG_ON(pid->level != ns->level); + pid->numbers[pid->level].nr = nr; + pid->numbers[pid->level].ns = ns; + atomic_set(&pid->count, 1); + for (type = 0; type < PIDTYPE_MAX; ++type) + INIT_HLIST_HEAD(&pid->tasks[type]); + + old_size = kmem_cache_objuse(pid->numbers[pid->level - 1].ns->pid_cachep); + new_size = kmem_cache_objuse(pid->numbers[pid->level].ns->pid_cachep); + local_irq_disable(); + /* + * Depending on sizeof(struct foo), cache flags (redzoning, etc) + * and actual CPU (cacheline_size() jump from 64 to 128 bytes after + * CPU detection) new size can very well be smaller than old size. 
+ */ + if (new_size > old_size) { + if (ub_kmemsize_charge(pid->ub, new_size - old_size, UB_HARD) < 0) + goto out_enable; + } else + ub_kmemsize_uncharge(pid->ub, old_size - new_size); + + write_lock(&tasklist_lock); + + spin_lock(&pidmap_lock); + reattach_pid(tsk, PIDTYPE_SID, pid); + reattach_pid(tsk, PIDTYPE_PGID, pid); + tsk->signal->leader_pid = pid; + current->signal->tty_old_pgrp = NULL; + + reattach_pid(tsk, PIDTYPE_PID, pid); + spin_unlock(&pidmap_lock); + + write_unlock_irq(&tasklist_lock); + + return 0; + +out_enable: + local_irq_enable(); + put_pid_ns(ns); +out_free: + kmem_cache_free(ns->pid_cachep, pid); +out: + return -ENOMEM; +} + +int pid_ns_attach_task(struct pid_namespace *ns, struct task_struct *tsk) +{ + return __pid_ns_attach_task(ns, tsk, 0); +} +EXPORT_SYMBOL_GPL(pid_ns_attach_task); + +int pid_ns_attach_init(struct pid_namespace *ns, struct task_struct *tsk) +{ + int err; + + err = __pid_ns_attach_task(ns, tsk, 1); + if (err < 0) + return err; + + ns->child_reaper = tsk; + return 0; +} +EXPORT_SYMBOL_GPL(pid_ns_attach_init); + +#ifdef CONFIG_VE +static noinline void show_lost_task(struct task_struct *p) +{ + printk("Lost task: %d/%s/%p blocked: %lx pending: %lx\n", + p->pid, p->comm, p, + p->blocked.sig[0], + p->pending.signal.sig[0]); +} + +static void zap_ve_processes(struct ve_struct *env) +{ + long delay = 1; /* sleep in jiffies; kept across passes so the doubling below takes effect */ + /* wait for all of init's children to exit */ + while (atomic_read(&env->pcounter) > 1) { + struct task_struct *g, *p; + + if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0) + continue; + /* it was ECHILD or no more children somehow */ + if (atomic_read(&env->pcounter) == 1) + break; + + /* clear all signals to avoid wakeups */ + if (signal_pending(current)) + flush_signals(current); + /* we have child without signal sent */ + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(delay); + delay = (delay < HZ) ? 
(delay << 1) : HZ; + read_lock(&tasklist_lock); + do_each_thread_ve(g, p) { + if (p != current) { + /* + * by that time no processes other than entered + * may exist in the VE. if some were missed by + * zap_pid_ns_processes() this was a BUG + */ + if (!p->did_ve_enter) + show_lost_task(p); + + force_sig_specific(SIGKILL, p); + } + } while_each_thread_ve(g, p); + read_unlock(&tasklist_lock); + } +} +#endif + void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; struct task_struct *task; + struct ve_struct *env = get_exec_env(); + if (pid_ns == env->ve_ns->pid_ns) { + /* + * Here the VE changes its state into "not running". + * op_sem taken for write is a barrier to all VE manipulations from + * ioctl: it waits for operations currently in progress and blocks all + * subsequent operations until is_running is set to 0 and op_sem is + * released. + */ + + down_write(&env->op_sem); + env->is_running = 0; + up_write(&env->op_sem); + + ve_hook_iterate_fini(VE_INIT_EXIT_CHAIN, env); + } /* * The last thread in the cgroup-init thread group is terminating. * Find remaining pid_ts in the namespace, signal and wait for them @@ -181,6 +340,11 @@ void zap_pid_ns_processes(struct pid_nam } while (rc != -ECHILD); acct_exit_ns(pid_ns); + +#ifdef CONFIG_VE + if (pid_ns == env->ve_ns->pid_ns) + zap_ve_processes(env); +#endif return; } diff -urNp linux-2.6.32.48/kernel/posix-timers.c linux-2.6.32.48-openvz/kernel/posix-timers.c --- linux-2.6.32.48/kernel/posix-timers.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/posix-timers.c 2011-11-21 17:40:47.000000000 -0500 @@ -31,6 +31,8 @@ * POSIX clocks & timers */ #include +#include +#include #include #include #include @@ -46,6 +48,9 @@ #include #include #include +#include + +#include /* * Management arrays for POSIX timers. 
Timers are kept in slab memory @@ -303,8 +308,8 @@ static __init int init_posix_timers(void register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof (struct k_itimer), 0, + SLAB_PANIC|SLAB_UBC, NULL); idr_init(&posix_timers_id); return 0; } @@ -363,6 +368,7 @@ int posix_timer_event(struct k_itimer *t { struct task_struct *task; int shared, ret = -1; + /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@ -379,8 +385,17 @@ int posix_timer_event(struct k_itimer *t rcu_read_lock(); task = pid_task(timr->it_pid, PIDTYPE_PID); if (task) { + struct ve_struct *ve; + struct user_beancounter *ub; + + ve = set_exec_env(task->ve_task_info.owner_env); + ub = set_exec_ub(task->task_bc.task_ub); + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); ret = send_sigqueue(timr->sigq, task, shared); + + (void)set_exec_ub(ub); + (void)set_exec_env(ve); } rcu_read_unlock(); /* If we failed to send the signal the timer stops. 
*/ diff -urNp linux-2.6.32.48/kernel/power/process.c linux-2.6.32.48-openvz/kernel/power/process.c --- linux-2.6.32.48/kernel/power/process.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/power/process.c 2011-11-21 17:40:47.000000000 -0500 @@ -15,6 +15,8 @@ #include #include +atomic_t global_suspend = ATOMIC_INIT(0); + /* * Timeout for stopping processes */ @@ -24,7 +26,9 @@ static inline int freezeable(struct task { if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->exit_state != 0)) + (p->exit_state != 0) || + (p->state == TASK_STOPPED) || + (p->state == TASK_TRACED)) return 0; return 1; } @@ -44,7 +48,7 @@ static int try_to_freeze_tasks(bool sig_ do { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (frozen(p) || !freezeable(p)) continue; @@ -60,7 +64,7 @@ static int try_to_freeze_tasks(bool sig_ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ if (time_after(jiffies, end_time)) @@ -84,13 +88,13 @@ static int try_to_freeze_tasks(bool sig_ elapsed_csecs / 100, elapsed_csecs % 100, todo); show_state(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); if (freezing(p) && !freezer_should_skip(p)) printk(KERN_ERR " %s\n", p->comm); cancel_freezing(p); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } else { printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, @@ -107,6 +111,7 @@ int freeze_processes(void) { int error; + atomic_inc(&global_suspend); printk("Freezing user space processes ... 
"); error = try_to_freeze_tasks(true); if (error) @@ -123,6 +128,7 @@ int freeze_processes(void) Exit: BUG_ON(in_atomic()); printk("\n"); + atomic_dec(&global_suspend); return error; } @@ -132,7 +138,7 @@ static void thaw_tasks(bool nosig_only) struct task_struct *g, *p; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (!freezeable(p)) continue; @@ -142,8 +148,10 @@ static void thaw_tasks(bool nosig_only) if (cgroup_freezing_or_frozen(p)) continue; - thaw_process(p); - } while_each_thread(g, p); + if (!thaw_process(p)) + printk(KERN_WARNING " Strange, %s not stopped\n", + p->comm ); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } diff -urNp linux-2.6.32.48/kernel/printk.c linux-2.6.32.48-openvz/kernel/printk.c --- linux-2.6.32.48/kernel/printk.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/printk.c 2011-11-21 17:40:47.000000000 -0500 @@ -31,7 +31,9 @@ #include #include #include +#include #include +#include #include #include @@ -100,7 +102,7 @@ static int console_locked, console_suspe * It is also used in interesting ways to provide interlocking in * release_console_sem(). 
*/ -static DEFINE_SPINLOCK(logbuf_lock); +DEFINE_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -136,6 +138,7 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +int console_silence_loglevel; #ifdef CONFIG_PRINTK @@ -162,6 +165,19 @@ void log_buf_kexec_setup(void) } #endif +static int __init setup_console_silencelevel(char *str) +{ + int level; + + if (get_option(&str, &level) != 1) + return 0; + + console_silence_loglevel = level; + return 1; +} + +__setup("silencelevel=", setup_console_silencelevel); + static int __init log_buf_len_setup(char *str) { unsigned size = memparse(str, &str); @@ -182,6 +198,9 @@ static int __init log_buf_len_setup(char spin_lock_irqsave(&logbuf_lock, flags); log_buf_len = size; log_buf = new_log_buf; +#ifdef CONFIG_VE + ve0.log_buf = log_buf; +#endif offset = start = min(con_start, log_start); dest_idx = 0; @@ -278,6 +297,9 @@ int do_syslog(int type, char __user *buf char c; int error = 0; + if (!ve_is_super(get_exec_env()) && (type == 6 || type == 7)) + goto out; + error = security_syslog(type); if (error) return error; @@ -298,15 +320,15 @@ int do_syslog(int type, char __user *buf error = -EFAULT; goto out; } - error = wait_event_interruptible(log_wait, - (log_start - log_end)); + error = wait_event_interruptible(ve_log_wait, + (ve_log_start - ve_log_end)); if (error) goto out; i = 0; spin_lock_irq(&logbuf_lock); - while (!error && (log_start != log_end) && i < len) { - c = LOG_BUF(log_start); - log_start++; + while (!error && (ve_log_start != ve_log_end) && i < len) { + c = VE_LOG_BUF(ve_log_start); + ve_log_start++; spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; @@ -332,15 +354,17 @@ int do_syslog(int type, char __user *buf error = -EFAULT; goto out; } + if (ve_log_buf == NULL) + goto out; count = len; - if (count > log_buf_len) - count = log_buf_len; 
spin_lock_irq(&logbuf_lock); - if (count > logged_chars) - count = logged_chars; + if (count > ve_log_buf_len) + count = ve_log_buf_len; + if (count > ve_logged_chars) + count = ve_logged_chars; if (do_clear) - logged_chars = 0; - limit = log_end; + ve_logged_chars = 0; + limit = ve_log_end; /* * __put_user() could sleep, and while we sleep * printk() could overwrite the messages @@ -349,9 +373,9 @@ int do_syslog(int type, char __user *buf */ for (i = 0; i < count && !error; i++) { j = limit-1-i; - if (j + log_buf_len < log_end) + if (j + ve_log_buf_len < ve_log_end) break; - c = LOG_BUF(j); + c = VE_LOG_BUF(j); spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); @@ -375,7 +399,7 @@ int do_syslog(int type, char __user *buf } break; case 5: /* Clear ring buffer */ - logged_chars = 0; + ve_logged_chars = 0; break; case 6: /* Disable logging to console */ if (saved_console_loglevel == -1) @@ -392,18 +416,21 @@ int do_syslog(int type, char __user *buf error = -EINVAL; if (len < 1 || len > 8) goto out; + error = 0; + /* VE has no console, so return success */ + if (!ve_is_super(get_exec_env())) + goto out; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = -1; - error = 0; break; case 9: /* Number of chars in the log buffer */ - error = log_end - log_start; + error = ve_log_end - ve_log_start; break; case 10: /* Size of the log buffer */ - error = log_buf_len; + error = ve_log_buf_len; break; default: error = -EINVAL; @@ -514,14 +541,14 @@ static void call_console_drivers(unsigne static void emit_log_char(char c) { - LOG_BUF(log_end) = c; - log_end++; - if (log_end - log_start > log_buf_len) - log_start = log_end - log_buf_len; - if (log_end - con_start > log_buf_len) - con_start = log_end - log_buf_len; - if (logged_chars < log_buf_len) - logged_chars++; + VE_LOG_BUF(ve_log_end) = c; + ve_log_end++; + if (ve_log_end - 
ve_log_start > ve_log_buf_len) + ve_log_start = ve_log_end - ve_log_buf_len; + if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len) + con_start = ve_log_end - ve_log_buf_len; + if (ve_logged_chars < ve_log_buf_len) + ve_logged_chars++; } /* @@ -586,6 +613,30 @@ static int have_callable_console(void) * See the vsnprintf() documentation for format string extensions over C99. */ +static inline int ve_log_init(void) +{ +#ifdef CONFIG_VE + if (ve_log_buf != NULL) + return 0; + + if (ve_is_super(get_exec_env())) { + ve0._log_wait = &log_wait; + ve0._log_start = &log_start; + ve0._log_end = &log_end; + ve0._logged_chars = &logged_chars; + ve0.log_buf = log_buf; + return 0; + } + + ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC); + if (!ve_log_buf) + return -ENOMEM; + + memset(ve_log_buf, 0, ve_log_buf_len); +#endif + return 0; +} + asmlinkage int printk(const char *fmt, ...) { va_list args; @@ -667,13 +718,14 @@ static inline void printk_delay(void) } } -asmlinkage int vprintk(const char *fmt, va_list args) +asmlinkage int __vprintk(const char *fmt, va_list args) { int printed_len = 0; int current_log_level = default_message_loglevel; unsigned long flags; int this_cpu; char *p; + int err, need_wake; boot_delay_msec(); printk_delay(); @@ -705,6 +757,13 @@ asmlinkage int vprintk(const char *fmt, spin_lock(&logbuf_lock); printk_cpu = this_cpu; + err = ve_log_init(); + if (err) { + printk_cpu = UINT_MAX; /* lock is being dropped: don't look like printk recursion */ + spin_unlock(&logbuf_lock); + printed_len = err; + goto out_lockdep; + } if (recursion_bug) { recursion_bug = 0; strcpy(printk_buf, recursion_bug_msg); @@ -788,19 +847,67 @@ asmlinkage int vprintk(const char *fmt, * will release 'logbuf_lock' regardless of whether it * actually gets the semaphore or not. 
*/ - if (acquire_console_semaphore_for_printk(this_cpu)) + if (!ve_is_super(get_exec_env())) { + need_wake = (ve_log_start != ve_log_end); + printk_cpu = UINT_MAX; + spin_unlock(&logbuf_lock); + lockdep_on(); + raw_local_irq_restore(flags); + if (!oops_in_progress && need_wake) + wake_up_interruptible(&ve_log_wait); + goto out_preempt; + } else if (acquire_console_semaphore_for_printk(this_cpu)) release_console_sem(); +out_lockdep: lockdep_on(); out_restore_irqs: raw_local_irq_restore(flags); +out_preempt: preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); EXPORT_SYMBOL(vprintk); +asmlinkage int vprintk(const char *fmt, va_list args) +{ + int i; + struct ve_struct *env; + + env = set_exec_env(get_ve0()); + i = __vprintk(fmt, args); + (void)set_exec_env(env); + return i; +} + +asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) +{ + int printed_len = 0; + va_list args2; + + va_copy(args2, args); + if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) + printed_len = vprintk(fmt, args); + if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) + printed_len = __vprintk(fmt, args2); + va_end(args2); /* pairs with va_copy() above */ + return printed_len; +} + +asmlinkage int ve_printk(int dst, const char *fmt, ...) 
+{ + va_list args; + int printed_len; + + va_start(args, fmt); + printed_len = ve_vprintk(dst, fmt, args); + va_end(args); + return printed_len; +} +EXPORT_SYMBOL(ve_printk); + #else static void call_console_drivers(unsigned start, unsigned end) @@ -1060,6 +1167,7 @@ void release_console_sem(void) _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ + printk_cpu = UINT_MAX; spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); @@ -1068,6 +1176,7 @@ void release_console_sem(void) } console_locked = 0; up(&console_sem); + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) wake_up_klogd(); @@ -1384,6 +1493,36 @@ int printk_ratelimit(void) } EXPORT_SYMBOL(printk_ratelimit); +/* + * Rate limiting stuff. + */ +int vz_ratelimit(struct vz_rate_info *p) +{ + unsigned long cjif, djif; + unsigned long flags; + static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; + long new_bucket; + + spin_lock_irqsave(&ratelimit_lock, flags); + cjif = jiffies; + djif = cjif - p->last; + if (djif < p->interval) { + if (p->bucket >= p->burst) { + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 0; + } + p->bucket++; + } else { + new_bucket = p->bucket - (djif / (unsigned)p->interval); + if (new_bucket < 0) + new_bucket = 0; + p->bucket = new_bucket + 1; + } + p->last = cjif; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 1; +} + /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state @@ -1407,3 +1546,65 @@ bool printk_timed_ratelimit(unsigned lon } EXPORT_SYMBOL(printk_timed_ratelimit); #endif + +static cpumask_t nmi_show_regs_cpus = CPU_MASK_NONE; +static unsigned long nmi_show_regs_timeout; + +void __attribute__((weak)) send_nmi_ipi_allbutself(void) +{ + cpus_clear(nmi_show_regs_cpus); +} + +static void busted_show_regs(struct pt_regs *regs, int in_nmi) +{ + if (!regs || (in_nmi && 
spin_is_locked(&logbuf_lock))) + return; + + bust_spinlocks(1); + printk("----------- IPI show regs -----------\n"); + show_regs(regs); + bust_spinlocks(0); +} + +void nmi_show_regs(struct pt_regs *regs, int in_nmi) +{ + if (cpus_empty(nmi_show_regs_cpus)) + goto doit; + + /* Previous request still in progress */ + if (time_before(jiffies, nmi_show_regs_timeout)) + return; + + if (!in_nmi || !spin_is_locked(&logbuf_lock)) { + int cpu; + + bust_spinlocks(1); + printk("previous show regs lost IPI to: "); + for_each_cpu_mask(cpu, nmi_show_regs_cpus) + printk("%d ", cpu); + printk("\n"); + bust_spinlocks(0); + } + +doit: + nmi_show_regs_timeout = jiffies + HZ/10; + nmi_show_regs_cpus = cpu_online_map; + cpu_clear(raw_smp_processor_id(), nmi_show_regs_cpus); + busted_show_regs(regs, in_nmi); + send_nmi_ipi_allbutself(); +} + +/* call only from nmi handler */ +int do_nmi_show_regs(struct pt_regs *regs, int cpu) +{ + static DEFINE_SPINLOCK(nmi_show_regs_lock); + + if (!cpu_isset(cpu, nmi_show_regs_cpus)) + return 0; + + spin_lock(&nmi_show_regs_lock); + busted_show_regs(regs, 1); + cpu_clear(cpu, nmi_show_regs_cpus); + spin_unlock(&nmi_show_regs_lock); + return 1; +} diff -urNp linux-2.6.32.48/kernel/ptrace.c linux-2.6.32.48-openvz/kernel/ptrace.c --- linux-2.6.32.48/kernel/ptrace.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/ptrace.c 2011-11-21 17:40:47.000000000 -0500 @@ -130,6 +130,8 @@ int __ptrace_may_access(struct task_stru * or halting the specified task is impossible. 
*/ int dumpable = 0; + int vps_dumpable = 0; + /* Don't let security modules deny introspection */ if (task == current) return 0; @@ -147,11 +149,17 @@ int __ptrace_may_access(struct task_stru } rcu_read_unlock(); smp_rmb(); - if (task->mm) + if (task->mm) { dumpable = get_dumpable(task->mm); + vps_dumpable = (task->mm->vps_dumpable == 1); + } + if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - + if (!vps_dumpable && !ve_is_super(get_exec_env())) + return -EPERM; + if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env())) + return -EPERM; return security_ptrace_access_check(task, mode); } @@ -190,6 +198,9 @@ int ptrace_attach(struct task_struct *ta task_unlock(task); if (retval) goto unlock_creds; + retval = -EACCES; + if (task->mm && task->mm->vps_dumpable == 2) /* mm may be NULL, as __ptrace_may_access() also allows */ + goto unlock_creds; write_lock_irq(&tasklist_lock); retval = -EPERM; @@ -396,6 +407,7 @@ int ptrace_writedata(struct task_struct } return copied; } +EXPORT_SYMBOL_GPL(access_process_vm); static int ptrace_setoptions(struct task_struct *child, long data) { @@ -584,6 +596,10 @@ static struct task_struct *ptrace_get_ta { struct task_struct *child; + /* ptracing of init from inside CT is dangerous */ + if (pid == 1 && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + rcu_read_lock(); child = find_task_by_vpid(pid); if (child) diff -urNp linux-2.6.32.48/kernel/sched.c linux-2.6.32.48-openvz/kernel/sched.c --- linux-2.6.32.48/kernel/sched.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/sched.c 2011-11-21 17:40:47.000000000 -0500 @@ -71,6 +71,8 @@ #include #include #include +#include +#include #include #include @@ -313,6 +315,8 @@ static inline struct task_group *task_gr #ifdef CONFIG_CGROUP_SCHED tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); +#elif defined(CONFIG_VZ_FAIRSCHED) + tg = p->fsched_node->tg; #else tg = &init_task_group; #endif @@ -520,6 +524,9 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long nr_sleeping; + 
unsigned long nr_stopped; + struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; @@ -607,6 +614,12 @@ static inline int cpu_of(struct rq *rq) #endif } +struct kernel_stat_glob kstat_glob; +DEFINE_SPINLOCK(kstat_glb_lock); +EXPORT_SYMBOL(kstat_glob); +EXPORT_SYMBOL(kstat_glb_lock); +static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, glob_kstat_lat); + /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. @@ -979,6 +992,220 @@ static inline void task_rq_unlock(struct spin_unlock_irqrestore(&rq->lock, *flags); } +#ifdef CONFIG_VE +struct ve_cpu_stats static_ve_cpu_stats; +EXPORT_SYMBOL(static_ve_cpu_stats); + +static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_iowait++; +} + +static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_iowait--; +} + +static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_unint++; +} + +static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_unint--; +} + +#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0) + +cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu) +{ + struct ve_cpu_stats *ve_stat; + unsigned v; + cycles_t strt, ret, cycles; + + ve_stat = VE_CPU_STATS(ve, cpu); + do { + v = read_seqcount_begin(&ve_stat->stat_lock); + ret = ve_stat->idle_time; + strt = ve_stat->strt_idle_time; + if (strt && nr_iowait_ve(ve) == 0) { + cycles = get_cycles(); + if (cycles_after(cycles, strt)) + ret += cycles - strt; + } + } while (read_seqcount_retry(&ve_stat->stat_lock, v)); + return ret; +} +EXPORT_SYMBOL(ve_sched_get_idle_time); + +cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu) +{ + struct ve_cpu_stats *ve_stat; + unsigned v; + cycles_t strt, ret, cycles; + + ve_stat = VE_CPU_STATS(ve, cpu); + do { + v = 
read_seqcount_begin(&ve_stat->stat_lock); + ret = ve_stat->iowait_time; + strt = ve_stat->strt_idle_time; + if (strt && nr_iowait_ve(ve) > 0) { + cycles = get_cycles(); + if (cycles_after(cycles, strt)) + ret += cycles - strt; + } + } while (read_seqcount_retry(&ve_stat->stat_lock, v)); + return ret; +} +EXPORT_SYMBOL(ve_sched_get_iowait_time); + +static void ve_stop_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles) +{ + struct ve_cpu_stats *ve_stat; + + ve_stat = VE_CPU_STATS(ve, cpu); + + write_seqcount_begin(&ve_stat->stat_lock); + if (ve_stat->strt_idle_time) { + if (cycles_after(cycles, ve_stat->strt_idle_time)) { + if (nr_iowait_ve(ve) == 0) + ve_stat->idle_time += + cycles - ve_stat->strt_idle_time; + else + ve_stat->iowait_time += + cycles - ve_stat->strt_idle_time; + } + ve_stat->strt_idle_time = 0; + } + write_seqcount_end(&ve_stat->stat_lock); +} + +static void ve_strt_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles) +{ + struct ve_cpu_stats *ve_stat; + + ve_stat = VE_CPU_STATS(ve, cpu); + + write_seqcount_begin(&ve_stat->stat_lock); + ve_stat->strt_idle_time = cycles; + write_seqcount_end(&ve_stat->stat_lock); +} + +static inline void ve_nr_running_inc(struct ve_struct *ve, int cpu, cycles_t cycles) +{ + if (++VE_CPU_STATS(ve, cpu)->nr_running == 1) + ve_stop_idle(ve, cpu, cycles); +} + +static inline void ve_nr_running_dec(struct ve_struct *ve, int cpu, cycles_t cycles) +{ + if (--VE_CPU_STATS(ve, cpu)->nr_running == 0) + ve_strt_idle(ve, cpu, cycles); +} + +void ve_sched_attach(struct ve_struct *target_ve) +{ + struct task_struct *tsk; + unsigned int cpu; + cycles_t cycles; + + tsk = current; + preempt_disable(); + cycles = get_cycles(); + cpu = task_cpu(tsk); + ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles); + ve_nr_running_inc(target_ve, cpu, cycles); + preempt_enable(); +} +EXPORT_SYMBOL(ve_sched_attach); + +static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc) +{ + struct ve_task_info 
*ti; + + ti = VE_TASK_INFO(p); + write_seqcount_begin(&ti->wakeup_lock); + ti->wakeup_stamp = cyc; + write_seqcount_end(&ti->wakeup_lock); +} + +static inline void update_sched_lat(struct task_struct *t, cycles_t cycles) +{ + int cpu; + cycles_t ve_wstamp; + + /* safe due to runqueue lock */ + cpu = smp_processor_id(); + ve_wstamp = t->ve_task_info.wakeup_stamp; + + if (ve_wstamp && cycles > ve_wstamp) { + KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, + cpu, cycles - ve_wstamp); + KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve, + cpu, cycles - ve_wstamp); + } +} + +static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles) +{ +#ifdef CONFIG_FAIRSCHED + if (prev != this_pcpu()->idle) { +#else + if (prev != this_rq()->idle) { +#endif + VE_CPU_STATS(prev->ve_task_info.owner_env, + smp_processor_id())->used_time += + cycles - prev->ve_task_info.sched_time; + + prev->ve_task_info.sched_time = cycles; + } +} +#else +static inline void ve_nr_running_inc(struct ve_struct *ve, int cpu, cycles_t cycles) +{ +} + +static inline void ve_nr_running_dec(struct ve_struct *ve, int cpu, cycles_t cycles) +{ +} + +static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu) +{ +} + +static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu) +{ +} + +static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu) +{ +} + +static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu) +{ +} + +static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles) +{ +} +#endif + +struct task_nrs_struct { + long nr_running; + long nr_unint; + long nr_stopped; + long nr_sleeping; + long nr_iowait; + long long nr_switches; +} ____cacheline_aligned_in_smp; + +unsigned long nr_zombie = 0; /* protected by tasklist_lock */ +EXPORT_SYMBOL(nr_zombie); + +atomic_t nr_dead = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_dead); + /* * this_rq_lock - lock this runqueue and disable interrupts. 
*/ @@ -2023,11 +2250,21 @@ static int effective_prio(struct task_st */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - if (task_contributes_to_load(p)) + cycles_t cycles; + +#ifdef CONFIG_VE + cycles = get_cycles(); + write_wakeup_stamp(p, cycles); + p->ve_task_info.sleep_time += cycles; +#endif + if (task_contributes_to_load(p)) { rq->nr_uninterruptible--; + ve_nr_unint_dec(VE_TASK_INFO(p)->owner_env, task_cpu(p)); + } enqueue_task(rq, p, wakeup, false); inc_nr_running(rq); + ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles); } /* @@ -2035,11 +2272,31 @@ static void activate_task(struct rq *rq, */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { - if (task_contributes_to_load(p)) + cycles_t cycles; + unsigned int cpu; + + cycles = get_cycles(); + cpu = task_cpu(p); + + p->ve_task_info.sleep_time -= cycles; + +#if 0 /* this is broken */ + if (p->state == TASK_INTERRUPTIBLE) { + rq->nr_sleeping++; + } + if (p->state == TASK_STOPPED) { + rq->nr_stopped++; + } +#endif + + if (task_contributes_to_load(p)) { rq->nr_uninterruptible++; + ve_nr_unint_inc(VE_TASK_INFO(p)->owner_env, cpu); + } dequeue_task(rq, p, sleep); dec_nr_running(rq); + ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, cpu, cycles); } /** @@ -2327,6 +2584,7 @@ unsigned long wait_task_inactive(struct return ncsw; } +EXPORT_SYMBOL_GPL(wait_task_inactive); /*** * kick_process - kick a running thread to enter/exit the kernel @@ -2505,6 +2763,7 @@ static int try_to_wake_up(struct task_st * First fix up the nr_uninterruptible count: */ if (task_contributes_to_load(p)) { + ve_nr_unint_dec(VE_TASK_INFO(p)->owner_env, cpu); if (likely(cpu_online(orig_cpu))) rq->nr_uninterruptible--; else @@ -2748,6 +3007,10 @@ void sched_fork(struct task_struct *p, i /* Want to start with kernel preemption disabled. 
*/ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_VE + /* cosmetic: sleep till wakeup below */ + p->ve_task_info.sleep_time -= get_cycles(); +#endif plist_node_init(&p->pushable_tasks, MAX_PRIO); put_cpu(); @@ -2990,6 +3253,7 @@ asmlinkage void schedule_tail(struct tas if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } +EXPORT_SYMBOL_GPL(schedule_tail); /* * context_switch - switch to the new MM and the new @@ -3061,6 +3325,7 @@ unsigned long nr_running(void) return sum; } +EXPORT_SYMBOL_GPL(nr_running); unsigned long nr_uninterruptible(void) { @@ -3078,6 +3343,7 @@ unsigned long nr_uninterruptible(void) return sum; } +EXPORT_SYMBOL_GPL(nr_uninterruptible); unsigned long long nr_context_switches(void) { @@ -3113,6 +3379,72 @@ unsigned long this_cpu_load(void) } +unsigned long nr_stopped(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_stopped; + if (unlikely((long)sum < 0)) + sum = 0; + return sum; +} +EXPORT_SYMBOL(nr_stopped); + +unsigned long nr_sleeping(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_sleeping; + if (unlikely((long)sum < 0)) + sum = 0; + return sum; +} +EXPORT_SYMBOL(nr_sleeping); + +#ifdef CONFIG_VE +unsigned long nr_running_ve(struct ve_struct *ve) +{ + int i; + long sum = 0; + cpumask_t ve_cpus; + + ve_cpu_online_map(ve, &ve_cpus); + for_each_cpu_mask(i, ve_cpus) + sum += VE_CPU_STATS(ve, i)->nr_running; + return (unsigned long)(sum < 0 ? 0 : sum); +} +EXPORT_SYMBOL(nr_running_ve); + +unsigned long nr_uninterruptible_ve(struct ve_struct *ve) +{ + int i; + long sum = 0; + cpumask_t ve_cpus; + + sum = 0; + ve_cpu_online_map(ve, &ve_cpus); + for_each_cpu_mask(i, ve_cpus) + sum += VE_CPU_STATS(ve, i)->nr_unint; + return (unsigned long)(sum < 0 ? 
0 : sum); +} +EXPORT_SYMBOL(nr_uninterruptible_ve); + +unsigned long nr_iowait_ve(struct ve_struct *ve) +{ + int i; + long sum = 0; + cpumask_t ve_cpus; + + ve_cpu_online_map(ve, &ve_cpus); + for_each_cpu_mask(i, ve_cpus) + sum += VE_CPU_STATS(ve, i)->nr_iowait; + return (unsigned long)(sum < 0 ? 0 : sum); +} +EXPORT_SYMBOL(nr_iowait_ve); +#endif + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; @@ -3134,6 +3466,16 @@ void get_avenrun(unsigned long *loads, u loads[2] = (avenrun[2] + offset) << shift; } +void get_avenrun_ve(struct ve_struct *ve, + unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (ve->avenrun[0] + offset) << shift; + loads[1] = (ve->avenrun[1] + offset) << shift; + loads[2] = (ve->avenrun[2] + offset) << shift; +} + + + static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -3142,6 +3484,35 @@ calc_load(unsigned long load, unsigned l return load >> FSHIFT; } +#ifdef CONFIG_VE +static void calc_load_ve(void) +{ + unsigned long flags, nr_unint, nr_active; + struct ve_struct *ve; + + read_lock(&ve_list_lock); + for_each_ve(ve) { + nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve); + nr_active *= FIXED_1; + + ve->avenrun[0] = calc_load(ve->avenrun[0], EXP_1, nr_active); + ve->avenrun[1] = calc_load(ve->avenrun[1], EXP_5, nr_active); + ve->avenrun[2] = calc_load(ve->avenrun[2], EXP_15, nr_active); + } + read_unlock(&ve_list_lock); + + nr_unint = nr_uninterruptible() * FIXED_1; + spin_lock_irqsave(&kstat_glb_lock, flags); + CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint); + CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint); + CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint); + spin_unlock_irqrestore(&kstat_glb_lock, flags); + +} +#else +#define calc_load_ve() do { } while (0) +#endif + /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. 
@@ -3161,6 +3532,8 @@ void calc_global_load(void) avenrun[1] = calc_load(avenrun[1], EXP_5, active); avenrun[2] = calc_load(avenrun[2], EXP_15, active); + calc_load_ve(); + calc_load_update += LOAD_FREQ; } @@ -3218,6 +3591,16 @@ static void update_cpu_load(struct rq *t sched_avg_update(this_rq); } +#ifdef CONFIG_VE +#define update_ve_cpu_time(p, time, tick) \ + do { \ + VE_CPU_STATS((p)->ve_task_info.owner_env, \ + task_cpu(p))->time += tick; \ + } while (0) +#else +#define update_ve_cpu_time(p, time, tick) do { } while (0) +#endif + #ifdef CONFIG_SMP /* @@ -3309,8 +3692,15 @@ unlock: static void pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq, int this_cpu) { + struct ve_struct *ve; + cycles_t cycles = get_cycles(); + + ve = VE_TASK_INFO(p)->owner_env; + deactivate_task(src_rq, p, 0); + ve_nr_running_dec(ve, task_cpu(p), cycles); set_task_cpu(p, this_cpu); + ve_nr_running_inc(ve, task_cpu(p), cycles); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); } @@ -5254,10 +5644,13 @@ void account_user_time(struct task_struc /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (TASK_NICE(p) > 0) { cpustat->nice = cputime64_add(cpustat->nice, tmp); - else + update_ve_cpu_time(p, nice, tmp); + } else { cpustat->user = cputime64_add(cpustat->user, tmp); + update_ve_cpu_time(p, user, tmp); + } cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ @@ -5314,6 +5707,7 @@ void account_system_time(struct task_str /* Add system time to cpustat. 
*/ tmp = cputime_to_cputime64(cputime); + update_ve_cpu_time(p, system, tmp); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (in_serving_softirq()) @@ -5737,6 +6131,8 @@ need_resched_nonpreemptible: next = pick_next_task(rq); if (likely(prev != next)) { + cycles_t cycles = get_cycles(); + sched_info_switch(prev, next); perf_event_task_sched_out(prev, next, cpu); @@ -5744,6 +6140,22 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; +#ifdef CONFIG_VE + prev->ve_task_info.sleep_stamp = cycles; + if (prev->state == TASK_RUNNING && prev != this_rq()->idle) + write_wakeup_stamp(prev, cycles); + update_sched_lat(next, cycles); + + /* because next & prev are protected with + * runqueue lock we may not worry about + * wakeup_stamp and sched_time protection + * (same thing in 'else' branch below) + */ + update_ve_task_info(prev, cycles); + next->ve_task_info.sched_time = cycles; + write_wakeup_stamp(next, 0); +#endif + context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under @@ -5751,8 +6163,10 @@ need_resched_nonpreemptible: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { + update_ve_task_info(prev, get_cycles()); spin_unlock_irq(&rq->lock); + } post_schedule(rq); @@ -6538,7 +6952,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ - if (user && !capable(CAP_SYS_NICE)) { + if (user && !capable(CAP_SYS_ADMIN)) { if (rt_policy(policy)) { unsigned long rlim_rtprio; @@ -7049,11 +7463,16 @@ EXPORT_SYMBOL(yield); void __sched io_schedule(void) { struct rq *rq = raw_rq(); +#ifdef CONFIG_VE + struct ve_struct *ve = current->ve_task_info.owner_env; +#endif delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); current->in_iowait = 1; + ve_nr_iowait_inc(ve, task_cpu(current)); schedule(); + ve_nr_iowait_dec(ve, task_cpu(current)); current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); @@ -7064,11 +7483,16 
@@ long __sched io_schedule_timeout(long ti { struct rq *rq = raw_rq(); long ret; +#ifdef CONFIG_VE + struct ve_struct *ve = current->ve_task_info.owner_env; +#endif delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); current->in_iowait = 1; + ve_nr_iowait_inc(ve, task_cpu(current)); ret = schedule_timeout(timeout); + ve_nr_iowait_dec(ve, task_cpu(current)); current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); @@ -7179,17 +7603,7 @@ void sched_show_task(struct task_struct state = p->state ? __ffs(p->state) + 1 : 0; printk(KERN_INFO "%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif + printk(KERN_CONT " %p ", p); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -7206,13 +7620,13 @@ void show_state_filter(unsigned long sta #if BITS_PER_LONG == 32 printk(KERN_INFO - " task PC stack pid father\n"); + " task taskaddr stack pid father\n"); #else printk(KERN_INFO - " task PC stack pid father\n"); + " task taskaddr stack pid father\n"); #endif read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take alot of time: @@ -7220,7 +7634,7 @@ void show_state_filter(unsigned long sta touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); touch_all_softlockup_watchdogs(); @@ -7584,13 +7998,13 @@ static void migrate_live_tasks(int src_c read_lock(&tasklist_lock); - do_each_thread(t, p) { + do_each_thread_all(t, p) { if (p == current) continue; if (task_cpu(p) == src_cpu) move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); + } 
while_each_thread_all(t, p); read_unlock(&tasklist_lock); } @@ -9720,6 +10134,7 @@ void __init sched_init(void) update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), __alignof__(unsigned long)); #endif + kstat_glob.sched_lat.cur = &per_cpu__glob_kstat_lat; for_each_possible_cpu(i) { struct rq *rq; @@ -9733,7 +10148,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED +#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED) /* * How much cpu bandwidth does init_task_group get? * @@ -9760,7 +10175,7 @@ void __init sched_init(void) rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED +#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED) init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); @@ -9827,6 +10242,7 @@ void __init sched_init(void) * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; + fairsched_init_early(); /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); @@ -9905,7 +10321,7 @@ void normalize_rt_tasks(void) struct rq *rq; read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * Only normalize user tasks: */ @@ -9936,7 +10352,7 @@ void normalize_rt_tasks(void) __task_rq_unlock(rq); spin_unlock(&p->pi_lock); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock_irqrestore(&tasklist_lock, flags); } @@ -10382,10 +10798,10 @@ static inline int tg_has_rt_tasks(struct { struct task_struct *g, *p; - do_each_thread(g, p) { + do_each_thread_ve(g, p) { if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) return 1; - } while_each_thread(g, p); + } 
while_each_thread_ve(g, p); return 0; } diff -urNp linux-2.6.32.48/kernel/sched_debug.c linux-2.6.32.48-openvz/kernel/sched_debug.c --- linux-2.6.32.48/kernel/sched_debug.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/sched_debug.c 2011-11-21 17:40:47.000000000 -0500 @@ -135,12 +135,12 @@ static void print_rq(struct seq_file *m, read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (!p->se.on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock_irqrestore(&tasklist_lock, flags); } diff -urNp linux-2.6.32.48/kernel/signal.c linux-2.6.32.48-openvz/kernel/signal.c --- linux-2.6.32.48/kernel/signal.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/signal.c 2011-11-21 17:40:47.000000000 -0500 @@ -33,13 +33,32 @@ #include #include #include +#include #include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. 
*/ -static struct kmem_cache *sigqueue_cachep; +struct kmem_cache *sigqueue_cachep; +EXPORT_SYMBOL(sigqueue_cachep); + +static int sig_ve_ignored(int sig, struct siginfo *info, struct task_struct *t) +{ + struct ve_struct *ve; + + /* always allow signals from the kernel */ + if (info == SEND_SIG_FORCED || + (!is_si_special(info) && SI_FROMKERNEL(info))) + return 0; + + ve = current->ve_task_info.owner_env; + if (ve->ve_ns->pid_ns->child_reaper != t) + return 0; + if (ve_is_super(get_exec_env())) + return 0; + return !sig_user_defined(t, sig) || sig_kernel_only(sig); +} static void __user *sig_handler(struct task_struct *t, int sig) { @@ -118,7 +137,7 @@ static inline int has_pending_signals(si #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) -static int recalc_sigpending_tsk(struct task_struct *t) +int recalc_sigpending_tsk(struct task_struct *t) { if (t->signal->group_stop_count > 0 || PENDING(&t->pending, &t->blocked) || @@ -143,6 +162,7 @@ void recalc_sigpending_and_wake(struct t if (recalc_sigpending_tsk(t)) signal_wake_up(t, 0); } +EXPORT_SYMBOL_GPL(recalc_sigpending_tsk); void recalc_sigpending(void) { @@ -209,8 +229,13 @@ static struct sigqueue *__sigqueue_alloc atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { q = kmem_cache_alloc(sigqueue_cachep, flags); + if (q && ub_siginfo_charge(q, get_task_ub(t))) { + kmem_cache_free(sigqueue_cachep, q); + q = NULL; + } + } if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); free_uid(user); @@ -229,6 +254,7 @@ static void __sigqueue_free(struct sigqu return; atomic_dec(&q->user->sigpending); free_uid(q->user); + ub_siginfo_uncharge(q); kmem_cache_free(sigqueue_cachep, q); } @@ -409,7 +435,18 @@ still_pending: static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info) { - int sig = next_signal(pending, mask); + int sig = 0; + + /* SIGKILL 
must have priority, otherwise it is quite easy + * to create an unkillable process, sending sig < SIGKILL + * to self */ + if (unlikely(sigismember(&pending->signal, SIGKILL))) { + if (!sigismember(mask, SIGKILL)) + sig = SIGKILL; + } + + if (likely(!sig)) + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { @@ -532,6 +569,7 @@ void signal_wake_up(struct task_struct * if (!wake_up_state(t, mask)) kick_process(t); } +EXPORT_SYMBOL_GPL(signal_wake_up); /* * Remove signals in mask from the pending set and queue. @@ -657,7 +695,7 @@ static int prepare_signal(int sig, struc t = p; do { rm_from_queue(sigmask(SIGCONT), &t->pending); - } while_each_thread(p, t); + } while_each_thread_all(p, t); } else if (sig == SIGCONT) { unsigned int why; /* @@ -689,7 +727,7 @@ static int prepare_signal(int sig, struc state |= TASK_INTERRUPTIBLE; } wake_up_state(t, state); - } while_each_thread(p, t); + } while_each_thread_all(p, t); /* * Notify the parent with CLD_CONTINUED if we were stopped. @@ -811,7 +849,7 @@ static void complete_signal(int sig, str do { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - } while_each_thread(p, t); + } while_each_thread_all(p, t); return; } } @@ -1082,7 +1120,8 @@ int group_send_sig_info(int sig, struct int ret = check_kill_permission(sig, info, p); if (!ret && sig) - ret = do_send_sig_info(sig, info, p, true); + ret = sig_ve_ignored(sig, info, p) ? 
0 : + do_send_sig_info(sig, info, p, true); return ret; } @@ -1207,7 +1246,7 @@ static int kill_something_info(int sig, int retval = 0, count = 0; struct task_struct * p; - for_each_process(p) { + for_each_process_ve(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p); @@ -1398,6 +1437,14 @@ int do_notify_parent(struct task_struct BUG_ON(!task_ptrace(tsk) && (tsk->group_leader != tsk || !thread_group_empty(tsk))); +#ifdef CONFIG_VE + /* Allow to send only SIGCHLD from VE */ + if (sig != SIGCHLD && + tsk->ve_task_info.owner_env != + tsk->parent->ve_task_info.owner_env) + sig = SIGCHLD; +#endif + info.si_signo = sig; info.si_errno = 0; /* @@ -1722,7 +1769,9 @@ static int do_signal_stop(int signr) /* Now we don't run again until woken by SIGCONT or SIGKILL */ do { + set_stop_state(current); schedule(); + clear_stop_state(current); } while (try_to_freeze()); tracehook_finish_jctl(); @@ -1784,8 +1833,6 @@ relock: * Now that we woke up, it's crucial if we're supposed to be * frozen that we freeze now before running anything substantial. */ - try_to_freeze(); - spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if @@ -2283,7 +2330,8 @@ do_send_specific(pid_t tgid, pid_t pid, * probe. No signal is actually delivered. */ if (!error && sig) { - error = do_send_sig_info(sig, info, p, false); + if (!sig_ve_ignored(sig, info, p)) + error = do_send_sig_info(sig, info, p, false); /* * If lock_task_sighand() failed we pretend the task * dies after receiving the signal. 
The window is tiny, @@ -2688,5 +2736,5 @@ __attribute__((weak)) const char *arch_v void __init signals_init(void) { - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC|SLAB_UBC); } diff -urNp linux-2.6.32.48/kernel/softirq.c linux-2.6.32.48-openvz/kernel/softirq.c --- linux-2.6.32.48/kernel/softirq.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/softirq.c 2011-11-21 17:40:47.000000000 -0500 @@ -25,6 +25,8 @@ #include #include +#include + #define CREATE_TRACE_POINTS #include @@ -206,10 +208,14 @@ EXPORT_SYMBOL(local_bh_enable_ip); asmlinkage void __do_softirq(void) { + struct user_beancounter *ub; struct softirq_action *h; __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; + struct ve_struct *envid; + + envid = set_exec_env(get_ve0()); pending = local_softirq_pending(); account_system_vtime(current); @@ -227,6 +233,7 @@ restart: h = softirq_vec; + ub = set_exec_ub(get_ub0()); do { if (pending & 1) { int prev_count = preempt_count(); @@ -249,6 +256,7 @@ restart: h++; pending >>= 1; } while (pending); + (void)set_exec_ub(ub); local_irq_disable(); @@ -262,6 +270,7 @@ restart: lockdep_softirq_exit(); account_system_vtime(current); + (void)set_exec_env(envid); __local_bh_enable(SOFTIRQ_OFFSET); } @@ -321,6 +330,7 @@ void irq_exit(void) { account_system_vtime(current); trace_hardirq_exit(); + restore_context(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); diff -urNp linux-2.6.32.48/kernel/sys.c linux-2.6.32.48-openvz/kernel/sys.c --- linux-2.6.32.48/kernel/sys.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/sys.c 2011-11-21 17:40:47.000000000 -0500 @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -115,6 +117,102 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); +DECLARE_MUTEX(virtinfo_sem); +EXPORT_SYMBOL(virtinfo_sem); +static struct 
vnotifier_block *virtinfo_chain[VIRT_TYPES]; + +void __virtinfo_notifier_register(int type, struct vnotifier_block *nb) +{ + struct vnotifier_block **p; + + for (p = &virtinfo_chain[type]; + *p != NULL && nb->priority < (*p)->priority; + p = &(*p)->next); + nb->next = *p; + smp_wmb(); + *p = nb; +} + +EXPORT_SYMBOL(__virtinfo_notifier_register); + +void virtinfo_notifier_register(int type, struct vnotifier_block *nb) +{ + down(&virtinfo_sem); + __virtinfo_notifier_register(type, nb); + up(&virtinfo_sem); +} + +EXPORT_SYMBOL(virtinfo_notifier_register); + +struct virtinfo_cnt_struct { + volatile unsigned long exit[NR_CPUS]; + volatile unsigned long entry; +}; +static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt); + +void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb) +{ + struct vnotifier_block **p; + int entry_cpu, exit_cpu; + unsigned long cnt, ent; + + down(&virtinfo_sem); + for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next); + *p = nb->next; + smp_mb(); + + for_each_cpu_mask(entry_cpu, cpu_possible_map) { + while (1) { + cnt = 0; + for_each_cpu_mask(exit_cpu, cpu_possible_map) + cnt += + per_cpu(virtcnt, entry_cpu).exit[exit_cpu]; + smp_rmb(); + ent = per_cpu(virtcnt, entry_cpu).entry; + if (cnt == ent) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ / 100); + } + } + up(&virtinfo_sem); +} + +EXPORT_SYMBOL(virtinfo_notifier_unregister); + +int virtinfo_notifier_call(int type, unsigned long n, void *data) +{ + int ret; + int entry_cpu, exit_cpu; + struct vnotifier_block *nb; + + entry_cpu = get_cpu(); + per_cpu(virtcnt, entry_cpu).entry++; + smp_wmb(); + put_cpu(); + + nb = virtinfo_chain[type]; + ret = NOTIFY_DONE; + while (nb) + { + ret = nb->notifier_call(nb, n, data, ret); + if(ret & NOTIFY_STOP_MASK) { + ret &= ~NOTIFY_STOP_MASK; + break; + } + nb = nb->next; + } + + exit_cpu = get_cpu(); + smp_wmb(); + per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++; + put_cpu(); + + return ret; +} + 
+EXPORT_SYMBOL(virtinfo_notifier_call); + /* * set the priority of a task * - the caller must hold the RCU read lock @@ -190,10 +288,10 @@ SYSCALL_DEFINE3(setpriority, int, which, !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread_ve(g, p) { if (__task_cred(p)->uid == who) error = set_one_prio(p, niceval, error); - while_each_thread(g, p); + } while_each_thread_ve(g, p); if (who != cred->uid) free_uid(user); /* For find_user() */ break; @@ -253,13 +351,13 @@ SYSCALL_DEFINE2(getpriority, int, which, !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread_ve(g, p) if (__task_cred(p)->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } - while_each_thread(g, p); + while_each_thread_ve(g, p); if (who != cred->uid) free_uid(user); /* for find_user() */ break; @@ -375,6 +473,27 @@ SYSCALL_DEFINE4(reboot, int, magic1, int magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + case LINUX_REBOOT_CMD_RESTART2: + set_bit(VE_REBOOT, &get_exec_env()->flags); + + case LINUX_REBOOT_CMD_HALT: + case LINUX_REBOOT_CMD_POWER_OFF: + force_sig(SIGKILL, + get_exec_env()->ve_ns->pid_ns->child_reaper); + + case LINUX_REBOOT_CMD_CAD_ON: + case LINUX_REBOOT_CMD_CAD_OFF: + return 0; + + default: + return -EINVAL; + } +#endif + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. 
*/ @@ -919,8 +1038,27 @@ void do_sys_times(struct tms *tms) tms->tms_cstime = cputime_to_clock_t(cstime); } +#ifdef CONFIG_VE +unsigned long long ve_relative_clock(struct timespec * ts) +{ + unsigned long long offset = 0; + + if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec || + (ts->tv_sec == get_exec_env()->start_timespec.tv_sec && + ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec)) + offset = (unsigned long long)(ts->tv_sec - + get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC + + ts->tv_nsec - get_exec_env()->start_timespec.tv_nsec; + return nsec_to_clock_t(offset); +} +#endif + SYSCALL_DEFINE1(times, struct tms __user *, tbuf) { +#ifdef CONFIG_VE + struct timespec now; +#endif + if (tbuf) { struct tms tmp; @@ -928,8 +1066,15 @@ SYSCALL_DEFINE1(times, struct tms __user if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } +#ifndef CONFIG_VE force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); +#else + /* Compare to calculation in fs/proc/array.c */ + do_posix_clock_monotonic_gettime(&now); + force_successful_syscall_return(); + return ve_relative_clock(&now); +#endif } /* @@ -1129,7 +1274,7 @@ SYSCALL_DEFINE2(sethostname, char __user int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1178,7 +1323,7 @@ SYSCALL_DEFINE2(setdomainname, char __us int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; diff -urNp linux-2.6.32.48/kernel/sysctl.c linux-2.6.32.48-openvz/kernel/sysctl.c --- linux-2.6.32.48/kernel/sysctl.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/sysctl.c 2011-11-21 17:40:47.000000000 -0500 @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -83,6 +84,21 @@ extern int pid_max_min, pid_max_max; extern int 
sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int ve_area_access_check; /* fs/namei.c */ +int ve_allow_kthreads = 1; +EXPORT_SYMBOL(ve_allow_kthreads); + +#ifdef CONFIG_MAGIC_SYSRQ +extern int sysrq_key_scancode; +#endif + +extern int alloc_fail_warn; +int decode_call_traces = 1; + +#ifdef CONFIG_VE +int glob_ve_meminfo = 0; +EXPORT_SYMBOL(glob_ve_meminfo); +#endif extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU @@ -169,6 +185,12 @@ static int proc_taint(struct ctl_table * void __user *buffer, size_t *lenp, loff_t *ppos); #endif +static int proc_dointvec_ve(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int sysctl_data_ve(struct ctl_table *table, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen); + static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { @@ -178,9 +200,31 @@ static struct ctl_table_header root_tabl .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }; -static struct ctl_table_root sysctl_table_root = { + +#ifdef CONFIG_VE +static int sysctl_root_perms(struct ctl_table_root *root, + struct nsproxy *namespaces, struct ctl_table *table) +{ + if (ve_is_super(get_exec_env())) + return table->mode; + else + return table->mode & ~0222; +} + +static struct ctl_table_root sysctl_table_groot = { .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), + .default_set.list = LIST_HEAD_INIT(sysctl_table_groot.default_set.list), + .default_set.parent = &sysctl_table_root.default_set, +}; +#else +#define sysctl_root_perms NULL +#define sysctl_table_groot sysctl_table_root +#endif + +static struct ctl_table_root sysctl_table_root = { + .root_list = LIST_HEAD_INIT(sysctl_table_groot.root_list), .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), + .permissions = 
sysctl_root_perms, }; static struct ctl_table kern_table[]; @@ -504,6 +548,20 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .procname = "silence-level", + .data = &console_silence_loglevel, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "alloc_fail_warn", + .data = &alloc_fail_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef __hppa__ { .ctl_name = KERN_HPPA_PWRSW, @@ -699,6 +757,24 @@ static struct ctl_table kern_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, +#ifdef CONFIG_VE + { + .procname = "ve_meminfo", + .data = &glob_ve_meminfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .procname = "sysrq-key", + .data = &sysrq_key_scancode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .ctl_name = KERN_PANIC_ON_OOPS, .procname = "panic_on_oops", @@ -824,10 +900,13 @@ static struct ctl_table kern_table[] = { { .ctl_name = KERN_RANDOMIZE, .procname = "randomize_va_space", - .data = &randomize_va_space, + .data = &_randomize_va_space, + .extra1 = (void *)offsetof(struct ve_struct, + _randomize_va_space), .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_ve, + .strategy = &sysctl_data_ve, }, #endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) @@ -1424,6 +1503,21 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif + { + .procname = "vsyscall", + .data = &sysctl_at_vsyscall, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "odirect_enable", + .data = &odirect_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, /* * NOTE: do not add new entries to this table unless you have read @@ -1600,6 +1694,13 @@ static struct ctl_table 
fs_table[] = { }; static struct ctl_table debug_table[] = { + { + .procname = "decode_call_traces", + .data = &decode_call_traces, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #if defined(CONFIG_X86) || defined(CONFIG_PPC) { .ctl_name = CTL_UNNUMBERED, @@ -2150,10 +2251,27 @@ struct ctl_table_header *__register_sysc struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table) { + if (!ve_is_super(get_exec_env())) { + WARN_ON(1); + return NULL; + } + return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, path, table); } +struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path, + struct ctl_table *table, int virtual_handler) +{ + if (!ve_is_super(get_exec_env())) { + WARN_ON(1); + return NULL; + } + + return __register_sysctl_paths(&sysctl_table_groot, current->nsproxy, + path, table); +} + /** * register_sysctl_table - register a sysctl table hierarchy * @table: the top-level table structure @@ -2170,6 +2288,14 @@ struct ctl_table_header *register_sysctl return register_sysctl_paths(null_path, table); } +struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table, + int virtual_handler) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_glob_paths(null_path, table, virtual_handler); +} + /** * unregister_sysctl_table - unregister a sysctl table hierarchy * @header: the header returned from register_sysctl_table @@ -2231,6 +2357,18 @@ struct ctl_table_header *register_sysctl return NULL; } +struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table, + int vh) +{ + return NULL; +} + +struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path, + struct ctl_table *table, int vh) +{ + return NULL; +} + void unregister_sysctl_table(struct ctl_table_header * table) { } @@ -2902,6 +3040,25 @@ static int proc_do_cad_pid(struct ctl_ta return 0; } +#ifdef CONFIG_VE +static int 
proc_dointvec_ve(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp_table; + + tmp_table = *table; + tmp_table.data = (char *)get_exec_env() + (unsigned long)table->extra1; + + return proc_dointvec(&tmp_table, write, buffer, lenp, ppos); +} +#else +static int proc_dointvec_ve(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#endif /* CONFIG_VE */ + #else /* CONFIG_PROC_FS */ int proc_dostring(struct ctl_table *table, int write, @@ -2996,6 +3153,27 @@ int sysctl_data(struct ctl_table *table, return 1; } +#ifdef CONFIG_VE +static int sysctl_data_ve(struct ctl_table *table, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + struct ctl_table tmp_table; + + tmp_table = *table; + tmp_table.data = (char *)get_exec_env() + (unsigned long)table->extra1; + + return sysctl_data(&tmp_table, oldval, oldlenp, newval, newlen); +} +#else +static int sysctl_data_ve(struct ctl_table *table, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + return sysctl_data(table, oldval, oldlenp, newval, newlen); +} +#endif + /* The generic string strategy routine: */ int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, @@ -3175,6 +3353,13 @@ int sysctl_data(struct ctl_table *table, return -ENOSYS; } +static int sysctl_data_ve(struct ctl_table *table, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + return -ENOSYS; +} + int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) @@ -3236,6 +3421,56 @@ static int deprecated_sysctl_warning(str return 0; } +#ifdef CONFIG_PID_NS +#include + +static int proc_pid_ns_hide_child(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int tmp, 
res; + + tmp = (current->nsproxy->pid_ns->flags & PID_NS_HIDE_CHILD) ? 1 : 0; + + res = __do_proc_dointvec(&tmp, table, write, buffer, + lenp, ppos, NULL, NULL); + if (res || !write) + return res; + + if (tmp) + current->nsproxy->pid_ns->flags |= PID_NS_HIDE_CHILD; + else + current->nsproxy->pid_ns->flags &= ~PID_NS_HIDE_CHILD; + return 0; +} + +static struct ctl_table pid_ns_kern_table[] = { + { + .procname = "pid_ns_hide_child", + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_pid_ns_hide_child, + }, + {} +}; + +static struct ctl_table pid_ns_root_table[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = pid_ns_kern_table, + }, + {} +}; + +static __init int pid_ns_sysctl_init(void) +{ + register_sysctl_table(pid_ns_root_table); + return 0; +} +postcore_initcall(pid_ns_sysctl_init); +#endif /* CONFIG_PID_NS */ + /* * No sense putting this after each symbol definition, twice, * exception granted :-) @@ -3249,7 +3484,9 @@ EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(register_sysctl_table); +EXPORT_SYMBOL(register_sysctl_glob_table); EXPORT_SYMBOL(register_sysctl_paths); +EXPORT_SYMBOL(register_sysctl_glob_paths); EXPORT_SYMBOL(sysctl_intvec); EXPORT_SYMBOL(sysctl_jiffies); EXPORT_SYMBOL(sysctl_ms_jiffies); diff -urNp linux-2.6.32.48/kernel/sys_ni.c linux-2.6.32.48-openvz/kernel/sys_ni.c --- linux-2.6.32.48/kernel/sys_ni.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/sys_ni.c 2011-11-21 17:40:47.000000000 -0500 @@ -179,3 +179,17 @@ cond_syscall(sys_eventfd2); /* performance counters: */ cond_syscall(sys_perf_event_open); +cond_syscall(sys_getluid); +cond_syscall(sys_setluid); +cond_syscall(sys_setublimit); +cond_syscall(compat_sys_setublimit); +cond_syscall(sys_ubstat); +cond_syscall(compat_sys_lutime); + +/* fairsched compat */ +cond_syscall(sys_fairsched_mknod); +cond_syscall(sys_fairsched_rmnod); 
+cond_syscall(sys_fairsched_mvpr); +cond_syscall(sys_fairsched_vcpus); +cond_syscall(sys_fairsched_chwt); +cond_syscall(sys_fairsched_rate); diff -urNp linux-2.6.32.48/kernel/taskstats.c linux-2.6.32.48-openvz/kernel/taskstats.c --- linux-2.6.32.48/kernel/taskstats.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/taskstats.c 2011-11-21 17:40:47.000000000 -0500 @@ -254,7 +254,7 @@ static int fill_tgid(pid_t tgid, struct stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; - } while_each_thread(first, tsk); + } while_each_thread_all(first, tsk); unlock_task_sighand(first, &flags); rc = 0; diff -urNp linux-2.6.32.48/kernel/time/timekeeping.c linux-2.6.32.48-openvz/kernel/time/timekeeping.c --- linux-2.6.32.48/kernel/time/timekeeping.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/time/timekeeping.c 2011-11-21 17:40:47.000000000 -0500 @@ -158,6 +158,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOC * used instead. */ struct timespec xtime __attribute__ ((aligned (16))); +EXPORT_SYMBOL_GPL(xtime); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static struct timespec total_sleep_time; diff -urNp linux-2.6.32.48/kernel/time.c linux-2.6.32.48-openvz/kernel/time.c --- linux-2.6.32.48/kernel/time.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/time.c 2011-11-21 17:40:47.000000000 -0500 @@ -610,10 +610,12 @@ EXPORT_SYMBOL(jiffies_to_clock_t); unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 + WARN_ON((long)x < 0); if (x >= ~0UL / (HZ / USER_HZ)) return ~0UL; return x * (HZ / USER_HZ); #else + WARN_ON((long)x < 0); /* Don't worry about loss of precision here .. 
*/ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; @@ -626,6 +628,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies); u64 jiffies_64_to_clock_t(u64 x) { + WARN_ON((s64)x < 0); #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ x = div_u64(x * USER_HZ, HZ); @@ -648,6 +651,7 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t); u64 nsec_to_clock_t(u64 x) { + WARN_ON((s64)x < 0); #if (NSEC_PER_SEC % USER_HZ) == 0 return div_u64(x, NSEC_PER_SEC / USER_HZ); #elif (USER_HZ % 512) == 0 diff -urNp linux-2.6.32.48/kernel/timer.c linux-2.6.32.48-openvz/kernel/timer.c --- linux-2.6.32.48/kernel/timer.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/timer.c 2011-11-21 17:40:47.000000000 -0500 @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -1000,6 +1001,7 @@ static inline void __run_timers(struct t spin_unlock_irq(&base->lock); { int preempt_count = preempt_count(); + struct ve_struct *ve; #ifdef CONFIG_LOCKDEP /* @@ -1023,7 +1025,9 @@ static inline void __run_timers(struct t lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer); + ve = set_exec_env(get_ve0()); fn(data); + (void)set_exec_env(ve); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1447,20 +1451,35 @@ int do_sysinfo(struct sysinfo *info) unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; struct timespec tp; + struct ve_struct *ve; memset(info, 0, sizeof(struct sysinfo)); + ve = get_exec_env(); ktime_get_ts(&tp); monotonic_to_bootbased(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); - get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); + if (ve_is_super(ve)) { + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - info->procs = nr_threads; + info->procs = nr_threads; + } else { + info->uptime -= ve->start_timespec.tv_sec; + + info->procs = atomic_read(&ve->pcounter); + + get_avenrun_ve(ve, info->loads, 0, SI_LOAD_SHIFT - FSHIFT); + } si_meminfo(info); si_swapinfo(info); +#ifdef CONFIG_BEANCOUNTERS + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, info) + & NOTIFY_FAIL) + return -ENOMSG; +#endif /* * If the sum of all the available memory (i.e. ram + swap) * is less than can be stored in a 32 bit unsigned long then diff -urNp linux-2.6.32.48/kernel/trace/ftrace.c linux-2.6.32.48-openvz/kernel/trace/ftrace.c --- linux-2.6.32.48/kernel/trace/ftrace.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/trace/ftrace.c 2011-11-21 17:40:47.000000000 -0500 @@ -3102,7 +3102,7 @@ static int alloc_retstack_tasklist(struc } read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { + do_each_thread_all(g, t) { if (start == end) { ret = -EAGAIN; goto unlock; @@ -3116,7 +3116,7 @@ static int alloc_retstack_tasklist(struc smp_wmb(); t->ret_stack = ret_stack_list[start++]; } - } while_each_thread(g, t); + } while_each_thread_all(g, t); unlock: read_unlock_irqrestore(&tasklist_lock, flags); diff -urNp linux-2.6.32.48/kernel/tracepoint.c linux-2.6.32.48-openvz/kernel/tracepoint.c --- linux-2.6.32.48/kernel/tracepoint.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/tracepoint.c 2011-11-21 17:40:47.000000000 -0500 @@ -596,11 +596,11 @@ void syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { + do_each_thread_ve(g, t) { /* Skip kernel threads. 
*/ if (t->mm) set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); + } while_each_thread_ve(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } sys_tracepoint_refcount++; @@ -614,9 +614,9 @@ void syscall_unregfunc(void) sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { + do_each_thread_ve(g, t) { clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); + } while_each_thread_ve(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } } diff -urNp linux-2.6.32.48/kernel/user.c linux-2.6.32.48-openvz/kernel/user.c --- linux-2.6.32.48/kernel/user.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/user.c 2011-11-21 17:40:47.000000000 -0500 @@ -174,13 +174,14 @@ struct user_struct *alloc_uid(struct use out_unlock: return NULL; } +EXPORT_SYMBOL_GPL(alloc_uid); static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); @@ -192,5 +193,6 @@ static int __init uid_cache_init(void) return 0; } +EXPORT_SYMBOL_GPL(free_uid); module_init(uid_cache_init); diff -urNp linux-2.6.32.48/kernel/user_namespace.c linux-2.6.32.48-openvz/kernel/user_namespace.c --- linux-2.6.32.48/kernel/user_namespace.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/user_namespace.c 2011-11-21 17:40:47.000000000 -0500 @@ -59,6 +59,7 @@ int create_user_ns(struct cred *new) return 0; } +EXPORT_SYMBOL(create_user_ns); /* * Deferred destructor for a user namespace. 
This is required because diff -urNp linux-2.6.32.48/kernel/utsname_sysctl.c linux-2.6.32.48-openvz/kernel/utsname_sysctl.c --- linux-2.6.32.48/kernel/utsname_sysctl.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/utsname_sysctl.c 2011-11-21 17:40:47.000000000 -0500 @@ -26,6 +26,10 @@ static void *get_uts(ctl_table *table, i down_read(&uts_sem); else down_write(&uts_sem); + + if (strcmp(table->procname, "virt_osrelease") == 0) + return virt_utsname.release; + return which; } @@ -126,19 +130,27 @@ static struct ctl_table uts_kern_table[] {} }; -static struct ctl_table uts_root_table[] = { +static struct ctl_table uts_virt_osrelease_table[] = { { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0555, - .child = uts_kern_table, + .procname = "virt_osrelease", + .data = virt_utsname.release, + .maxlen = sizeof(virt_utsname.release), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = sysctl_uts_string, }, {} }; +static struct ctl_path uts_path[] = { + { .ctl_name = CTL_KERN, .procname = "kernel", }, + { } +}; + static int __init utsname_sysctl_init(void) { - register_sysctl_table(uts_root_table); + register_sysctl_glob_paths(uts_path, uts_kern_table, 1); + register_sysctl_paths(uts_path, uts_virt_osrelease_table); return 0; } diff -urNp linux-2.6.32.48/kernel/ve/hooks.c linux-2.6.32.48-openvz/kernel/ve/hooks.c --- linux-2.6.32.48/kernel/ve/hooks.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/ve/hooks.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,114 @@ +/* + * linux/kernel/ve/hooks.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +static struct list_head ve_hooks[VE_MAX_CHAINS]; +static DECLARE_RWSEM(ve_hook_sem); + +void ve_hook_register(int chain, struct ve_hook *vh) +{ + struct list_head *lh; + struct ve_hook *tmp; + + BUG_ON(chain < 0 || chain >= VE_MAX_CHAINS); /* ve_hooks[] has exactly VE_MAX_CHAINS slots */ + + down_write(&ve_hook_sem); + list_for_each(lh, &ve_hooks[chain]) { + tmp = list_entry(lh, struct ve_hook, list); + if (vh->priority < tmp->priority) + break; + } + + list_add_tail(&vh->list, lh); + up_write(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_register); + +void ve_hook_unregister(struct ve_hook *vh) +{ + down_write(&ve_hook_sem); + list_del(&vh->list); + up_write(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_unregister); + +static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve) +{ + int err; + + err = 0; + if (try_module_get(vh->owner)) { + err = vh->init(ve); + module_put(vh->owner); + } + return err; +} + +static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve) +{ + if (vh->fini != NULL && try_module_get(vh->owner)) { + vh->fini(ve); + module_put(vh->owner); + } +} + +int ve_hook_iterate_init(int chain, void *ve) +{ + struct ve_hook *vh; + int err; + + err = 0; + + down_read(&ve_hook_sem); + list_for_each_entry(vh, &ve_hooks[chain], list) + if ((err = ve_hook_init(vh, ve)) < 0) + break; + + if (err) + list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list) + ve_hook_fini(vh, ve); + + up_read(&ve_hook_sem); + return err; +} + +EXPORT_SYMBOL(ve_hook_iterate_init); + +void ve_hook_iterate_fini(int chain, void *ve) +{ + struct ve_hook *vh; + + down_read(&ve_hook_sem); + list_for_each_entry_reverse(vh, &ve_hooks[chain], list) + ve_hook_fini(vh, ve); + up_read(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_iterate_fini); + +static int __init ve_hooks_init(void) +{ + int i; + + for (i = 0; i < VE_MAX_CHAINS; i++) + INIT_LIST_HEAD(&ve_hooks[i]); + return 0; +} + +core_initcall(ve_hooks_init); + diff -urNp
linux-2.6.32.48/kernel/ve/Makefile linux-2.6.32.48-openvz/kernel/ve/Makefile --- linux-2.6.32.48/kernel/ve/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/ve/Makefile 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,17 @@ +# +# +# kernel/ve/Makefile +# +# Copyright (C) 2000-2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-$(CONFIG_VE) = ve.o veowner.o hooks.o +obj-$(CONFIG_VZ_WDOG) += vzwdog.o +obj-$(CONFIG_VE_CALLS) += vzmon.o + +vzmon-objs = vecalls.o + +obj-$(CONFIG_VZ_DEV) += vzdev.o +obj-$(CONFIG_VZ_EVENT) += vzevent.o diff -urNp linux-2.6.32.48/kernel/ve/ve.c linux-2.6.32.48-openvz/kernel/ve/ve.c --- linux-2.6.32.48/kernel/ve/ve.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/ve/ve.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,160 @@ +/* + * linux/kernel/ve/ve.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +/* + * 've.c' helper file performing VE sub-system initialization + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +unsigned long vz_rstamp = 0x37e0f59d; + +#ifdef CONFIG_MODULES +struct module no_module = { .state = MODULE_STATE_GOING }; +EXPORT_SYMBOL(no_module); +#endif + +#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS) +void (*do_env_free_hook)(struct ve_struct *ve); +EXPORT_SYMBOL(do_env_free_hook); + +void do_env_free(struct ve_struct *env) +{ + BUG_ON(atomic_read(&env->pcounter) > 0); + BUG_ON(env->is_running); + + preempt_disable(); + do_env_free_hook(env); + preempt_enable(); +} +EXPORT_SYMBOL(do_env_free); +#endif + +int (*do_ve_enter_hook)(struct ve_struct *ve, unsigned int flags); +EXPORT_SYMBOL(do_ve_enter_hook); + +struct ve_struct ve0 = { + .counter = ATOMIC_INIT(1), + .pcounter = ATOMIC_INIT(1), + .ve_list = LIST_HEAD_INIT(ve0.ve_list), + .vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh), + .start_jiffies = INITIAL_JIFFIES, + .ve_ns = &init_nsproxy, + .ve_netns = &init_net, + .user_ns = &init_user_ns, + .is_running = 1, + .op_sem = __RWSEM_INITIALIZER(ve0.op_sem), +#ifdef CONFIG_VE_IPTABLES + .ipt_mask = VE_IP_ALL, + ._iptables_modules = VE_IP_ALL, +#endif + .features = -1, + ._randomize_va_space = +#ifdef CONFIG_COMPAT_BRK + 1, +#else + 2, +#endif +}; + +EXPORT_SYMBOL(ve0); + +LIST_HEAD(ve_list_head); +rwlock_t ve_list_lock = RW_LOCK_UNLOCKED; + +LIST_HEAD(ve_cleanup_list); +DEFINE_SPINLOCK(ve_cleanup_lock); +struct task_struct *ve_cleanup_thread; + +EXPORT_SYMBOL(ve_list_lock); +EXPORT_SYMBOL(ve_list_head); +EXPORT_SYMBOL(ve_cleanup_lock); +EXPORT_SYMBOL(ve_cleanup_list); +EXPORT_SYMBOL(ve_cleanup_thread); + +static DEFINE_PER_CPU(struct ve_cpu_stats, ve0_cpustats); +static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, 
ve0_lat_stats); + +void init_ve0(void) +{ + struct ve_struct *ve; + + ve = get_ve0(); + ve->cpu_stats = &per_cpu__ve0_cpustats; + ve->sched_lat_ve.cur = &per_cpu__ve0_lat_stats; + list_add(&ve->ve_list, &ve_list_head); +} + +void ve_cleanup_schedule(struct ve_struct *ve) +{ + BUG_ON(ve_cleanup_thread == NULL); + + spin_lock(&ve_cleanup_lock); + list_add_tail(&ve->cleanup_list, &ve_cleanup_list); + spin_unlock(&ve_cleanup_lock); + + wake_up_process(ve_cleanup_thread); +} + +#ifdef CONFIG_BLK_CGROUP +extern int blkiocg_set_weight(struct cgroup *cgroup, u64 val); + +static u64 ioprio_weight[VE_IOPRIO_MAX] = {200, 275, 350, 425, 500, 575, 650, 725}; + +int ve_set_ioprio(int veid, int ioprio) +{ + struct ve_struct *ve; + int ret; + + if (ioprio < VE_IOPRIO_MIN || ioprio >= VE_IOPRIO_MAX) + return -ERANGE; + + ret = -ESRCH; + read_lock(&ve_list_lock); + for_each_ve(ve) { + if (ve->veid != veid) + continue; + ret = blkiocg_set_weight(ve->ve_cgroup, ioprio_weight[ioprio]); + break; + } + read_unlock(&ve_list_lock); + + return ret; +} +#else +int ve_set_ioprio(int veid, int ioprio) +{ + return -EINVAL; +} +#endif /* CONFIG_BLK_CGROUP */ diff -urNp linux-2.6.32.48/kernel/ve/vecalls.c linux-2.6.32.48-openvz/kernel/ve/vecalls.c --- linux-2.6.32.48/kernel/ve/vecalls.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/ve/vecalls.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,2335 @@ +/* + * linux/kernel/ve/vecalls.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + */ + +/* + * 'vecalls.c' is file with basic VE support. 
It provides basic primities + * along with initialization script + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#ifdef CONFIG_VZ_FAIRSCHED +#include +#endif + +#include +#include +#include + +int nr_ve = 1; /* One VE always exists. Compatibility with vestat */ +EXPORT_SYMBOL(nr_ve); + +static int do_env_enter(struct ve_struct *ve, unsigned int flags); +static int alloc_ve_tty_drivers(struct ve_struct* ve); +static void free_ve_tty_drivers(struct ve_struct* ve); +static int register_ve_tty_drivers(struct ve_struct* ve); +static void unregister_ve_tty_drivers(struct ve_struct* ve); +static int init_ve_tty_drivers(struct ve_struct *); +static void fini_ve_tty_drivers(struct ve_struct *); +static void clear_termios(struct tty_driver* driver ); + +static void vecalls_exit(void); + +struct ve_struct *__find_ve_by_id(envid_t veid) +{ + struct ve_struct *ve; + + for_each_ve(ve) { + if (ve->veid == veid) + return ve; + } + return NULL; +} +EXPORT_SYMBOL(__find_ve_by_id); + +struct ve_struct *get_ve_by_id(envid_t veid) +{ + struct ve_struct *ve; + read_lock(&ve_list_lock); + ve = __find_ve_by_id(veid); + get_ve(ve); + read_unlock(&ve_list_lock); + return ve; +} +EXPORT_SYMBOL(get_ve_by_id); + +/* + * real_put_ve() MUST be used instead of put_ve() inside vecalls. 
+ */ +static void real_do_env_free(struct ve_struct *ve); +static inline void real_put_ve(struct ve_struct *ve) +{ + if (ve && atomic_dec_and_test(&ve->counter)) { + BUG_ON(atomic_read(&ve->pcounter) > 0); + BUG_ON(ve->is_running); + real_do_env_free(ve); + } +} + +static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf) +{ + struct ve_struct *ve; + struct vz_cpu_stat *vstat; + int retval; + int i, cpu; + unsigned long tmp; + + if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid)) + return -EPERM; + if (veid == 0) + return -ESRCH; + + vstat = kzalloc(sizeof(*vstat), GFP_KERNEL); + if (!vstat) + return -ENOMEM; + + retval = -ESRCH; + read_lock(&ve_list_lock); + ve = __find_ve_by_id(veid); + if (ve == NULL) + goto out_unlock; + for_each_online_cpu(cpu) { + struct ve_cpu_stats *st; + + st = VE_CPU_STATS(ve, cpu); + vstat->user_jif += (unsigned long)cputime64_to_clock_t(st->user); + vstat->nice_jif += (unsigned long)cputime64_to_clock_t(st->nice); + vstat->system_jif += (unsigned long)cputime64_to_clock_t(st->system); + vstat->idle_clk += ve_sched_get_idle_time(ve, cpu); + } + vstat->uptime_clk = get_cycles() - ve->start_cycles; + vstat->uptime_jif = (unsigned long)cputime64_to_clock_t( + get_jiffies_64() - ve->start_jiffies); + for (i = 0; i < 3; i++) { + tmp = ve->avenrun[i] + (FIXED_1/200); + vstat->avenrun[i].val_int = LOAD_INT(tmp); + vstat->avenrun[i].val_frac = LOAD_FRAC(tmp); + } + read_unlock(&ve_list_lock); + + retval = 0; + if (copy_to_user(buf, vstat, sizeof(*vstat))) + retval = -EFAULT; +out_free: + kfree(vstat); + return retval; + +out_unlock: + read_unlock(&ve_list_lock); + goto out_free; +} + +static int real_setdevperms(envid_t veid, unsigned type, + dev_t dev, unsigned mask) +{ + struct ve_struct *ve; + int err; + + if (!capable_setveid() || veid == 0) + return -EPERM; + + if ((ve = get_ve_by_id(veid)) == NULL) + return -ESRCH; + + down_read(&ve->op_sem); + err = -ESRCH; + if (ve->is_running) + err = 
set_device_perms_ve(ve, type, dev, mask); + up_read(&ve->op_sem); + real_put_ve(ve); + return err; +} + +/********************************************************************** + ********************************************************************** + * + * VE start: subsystems + * + ********************************************************************** + **********************************************************************/ + +static int prepare_proc_root(struct ve_struct *ve) +{ + struct proc_dir_entry *de; + + de = kzalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL); + if (de == NULL) + return -ENOMEM; + + memcpy(de + 1, "/proc", 6); + de->name = (char *)(de + 1); + de->namelen = 5; + de->mode = S_IFDIR | S_IRUGO | S_IXUGO; + de->nlink = 2; + atomic_set(&de->count, 1); + + ve->proc_root = de; + return 0; +} + +#ifdef CONFIG_PROC_FS +static int init_ve_proc(struct ve_struct *ve) +{ + int err; + + err = prepare_proc_root(ve); + if (err) + goto out_root; + + err = register_ve_fs_type(ve, &proc_fs_type, + &ve->proc_fstype, &ve->proc_mnt); + if (err) + goto out_reg; + +#ifdef CONFIG_PRINTK + proc_create("kmsg", S_IRUSR, ve->proc_root, &proc_kmsg_operations); +#endif + proc_mkdir("vz", ve->proc_root); + + ve->ve_ns->pid_ns->proc_mnt = mntget(ve->proc_mnt); + return 0; + +out_reg: + /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */ + ; +out_root: + return err; +} + +static void fini_ve_proc(struct ve_struct *ve) +{ + remove_proc_entry("vz", ve->proc_root); + remove_proc_entry("kmsg", ve->proc_root); + unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); + ve->proc_mnt = NULL; +} + +static void free_ve_proc(struct ve_struct *ve) +{ + /* proc filesystem frees proc_dir_entries on remove_proc_entry() only, + so we check that everything was removed and not lost */ + if (ve->proc_root && ve->proc_root->subdir) { + struct proc_dir_entry *p = ve->proc_root; + printk(KERN_WARNING "CT: %d: proc entry /proc", ve->veid); + while ((p = p->subdir) != 
NULL) + printk("/%s", p->name); + printk(" is not removed!\n"); + } + + kfree(ve->proc_root); + kfree(ve->proc_fstype); + + ve->proc_fstype = NULL; + ve->proc_root = NULL; +} +#else +#define init_ve_proc(ve) (0) +#define fini_ve_proc(ve) do { } while (0) +#define free_ve_proc(ve) do { } while (0) +#endif + +#ifdef CONFIG_UNIX98_PTYS +#include + +/* + * DEVPTS needs a virtualization: each environment should see each own list of + * pseudo-terminals. + * To implement it we need to have separate devpts superblocks for each + * VE, and each VE should mount its own one. + * Thus, separate vfsmount structures are required. + * To minimize intrusion into vfsmount lookup code, separate file_system_type + * structures are created. + * + * In addition to this, patch fo character device itself is required, as file + * system itself is used only for MINOR/MAJOR lookup. + */ + +static int init_ve_devpts(struct ve_struct *ve) +{ + return register_ve_fs_type(ve, &devpts_fs_type, + &ve->devpts_fstype, &ve->devpts_mnt); +} + +static void fini_ve_devpts(struct ve_struct *ve) +{ + unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt); +} +#else +#define init_ve_devpts(ve) (0) +#define fini_ve_devpts(ve) do { } while (0) +#endif + +static int init_ve_shmem(struct ve_struct *ve) +{ + return register_ve_fs_type(ve, + &tmpfs_fs_type, + &ve->shmem_fstype, + &ve->shmem_mnt); +} + +static void fini_ve_shmem(struct ve_struct *ve) +{ + unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt); + /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */ + ve->shmem_mnt = NULL; +} + +#ifdef CONFIG_SYSFS +static int init_ve_sysfs_root(struct ve_struct *ve) +{ + struct sysfs_dirent *sysfs_root; + + sysfs_root = kzalloc(sizeof(struct sysfs_dirent), GFP_KERNEL); + if (sysfs_root == NULL) + return -ENOMEM; + sysfs_root->s_name = ""; + atomic_set(&sysfs_root->s_count, 1); + sysfs_root->s_flags = SYSFS_DIR; + sysfs_root->s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + sysfs_root->s_ino = 1; 
+ + ve->_sysfs_root = sysfs_root; + return 0; +} +#endif + +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) +extern struct device_attribute ve_net_class_attributes[]; +static inline int init_ve_netclass(void) +{ + struct class *nc; + int err; + + nc = kzalloc(sizeof(*nc), GFP_KERNEL); + if (!nc) + return -ENOMEM; + + nc->name = net_class.name; + nc->dev_release = net_class.dev_release; + nc->dev_uevent = net_class.dev_uevent; + nc->dev_attrs = ve_net_class_attributes; + + err = class_register(nc); + if (!err) { + get_exec_env()->net_class = nc; + return 0; + } + kfree(nc); + return err; +} + +static inline void fini_ve_netclass(void) +{ + struct ve_struct *ve = get_exec_env(); + + class_unregister(ve->net_class); + kfree(ve->net_class); + ve->net_class = NULL; +} +#else +static inline int init_ve_netclass(void) { return 0; } +static inline void fini_ve_netclass(void) { ; } +#endif + +static const struct { + unsigned minor; + char *name; +} mem_class_devices [] = { + {3, "null"}, + {5, "zero"}, + {7, "full"}, + {8, "random"}, + {9, "urandom"}, + {0, NULL}, +}; + +static int init_ve_mem_class(void) +{ + int i; + struct class *ve_mem_class; + + ve_mem_class = class_create(THIS_MODULE, "mem"); + if (IS_ERR(ve_mem_class)) + return -ENOMEM; + + for (i = 0; mem_class_devices[i].name; i++) + device_create(ve_mem_class, NULL, + MKDEV(MEM_MAJOR, mem_class_devices[i].minor), + NULL, mem_class_devices[i].name); + + get_exec_env()->mem_class = ve_mem_class; + return 0; +} + + +void fini_ve_mem_class(void) +{ + int i; + struct class *ve_mem_class = get_exec_env()->mem_class; + + for (i = 0; mem_class_devices[i].name; i++) + device_destroy(ve_mem_class, + MKDEV(MEM_MAJOR, mem_class_devices[i].minor)); + class_destroy(ve_mem_class); +} + +static int init_ve_sysfs(struct ve_struct *ve) +{ + int err; + +#ifdef CONFIG_SYSFS + err = 0; + if (ve->features & VE_FEATURE_SYSFS) { + err = init_ve_sysfs_root(ve); + if (err != 0) + goto out; + err = register_ve_fs_type(ve, + &sysfs_fs_type, 
+ &ve->sysfs_fstype, + &ve->sysfs_mnt); + if (err != 0) + goto out_fs_type; + } +#endif + + err = classes_init(); + if (err != 0) + goto err_classes; + + err = devices_init(); + if (err != 0) + goto err_devices; + + err = init_ve_netclass(); + if (err != 0) + goto err_net; + + err = init_ve_tty_class(); + if (err != 0) + goto err_tty; + + err = init_ve_mem_class(); + if (err != 0) + goto err_mem; + + return 0; + +err_mem: + fini_ve_tty_class(); +err_tty: + fini_ve_netclass(); +err_net: + devices_fini(); +err_devices: + classes_fini(); +err_classes: +#ifdef CONFIG_SYSFS + unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); + /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ +out_fs_type: + kfree(ve->_sysfs_root); + ve->_sysfs_root = NULL; +out: +#endif + return err; +} + +static void fini_ve_sysfs(struct ve_struct *ve) +{ + fini_ve_mem_class(); + fini_ve_tty_class(); + fini_ve_netclass(); + devices_fini(); + classes_fini(); +#ifdef CONFIG_SYSFS + unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); + ve->sysfs_mnt = NULL; + kfree(ve->_sysfs_root); + ve->_sysfs_root = NULL; + /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ +#endif +} + +static void free_ve_filesystems(struct ve_struct *ve) +{ +#ifdef CONFIG_SYSFS + kfree(ve->sysfs_fstype); + ve->sysfs_fstype = NULL; +#endif + kfree(ve->shmem_fstype); + ve->shmem_fstype = NULL; + + kfree(ve->devpts_fstype); + ve->devpts_fstype = NULL; + + free_ve_proc(ve); +} + +static int init_printk(struct ve_struct *ve) +{ + struct ve_prep_printk { + wait_queue_head_t log_wait; + unsigned log_start; + unsigned log_end; + unsigned logged_chars; + } *tmp; + + tmp = kzalloc(sizeof(struct ve_prep_printk), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + init_waitqueue_head(&tmp->log_wait); + ve->_log_wait = &tmp->log_wait; + ve->_log_start = &tmp->log_start; + ve->_log_end = &tmp->log_end; + ve->_logged_chars = &tmp->logged_chars; + /* ve->log_buf will be initialized later by ve_log_init() */ + 
return 0; +} + +static void fini_printk(struct ve_struct *ve) +{ + /* + * there is no spinlock protection here because nobody can use + * log_buf at the moments when this code is called. + */ + kfree(ve->log_buf); + kfree(ve->_log_wait); +} + +static void fini_venet(struct ve_struct *ve) +{ +#ifdef CONFIG_INET + tcp_v4_kill_ve_sockets(ve); + synchronize_net(); +#endif +} + +static int init_ve_sched(struct ve_struct *ve) +{ + int err; + + err = fairsched_new_node(ve->veid, 0); + if (err == 0) + ve_sched_attach(ve); + + return err; +} + +static void fini_ve_sched(struct ve_struct *ve) +{ + fairsched_drop_node(ve->veid); +} + +/* + * Namespaces + */ + +static inline int init_ve_namespaces(struct ve_struct *ve, + struct nsproxy **old) +{ + int err; + struct task_struct *tsk; + struct nsproxy *cur; + + tsk = current; + cur = tsk->nsproxy; + + err = copy_namespaces(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID, + tsk, 1); + if (err < 0) + return err; + + ve->ve_ns = get_nsproxy(tsk->nsproxy); + memcpy(ve->ve_ns->uts_ns->name.release, virt_utsname.release, + sizeof(virt_utsname.release)); + + if (cur->pid_ns->flags & PID_NS_HIDE_CHILD) + ve->ve_ns->pid_ns->flags |= PID_NS_HIDDEN; + + *old = cur; + return 0; +} + +static inline void fini_ve_namespaces(struct ve_struct *ve, + struct nsproxy *old) +{ + struct task_struct *tsk = current; + struct nsproxy *tmp; + + if (old) { + tmp = tsk->nsproxy; + tsk->nsproxy = get_nsproxy(old); + put_nsproxy(tmp); + tmp = ve->ve_ns; + ve->ve_ns = get_nsproxy(old); + put_nsproxy(tmp); + } else { + put_user_ns(ve->user_ns); + put_nsproxy(ve->ve_ns); + ve->ve_ns = NULL; + } +} + +static int init_ve_netns(struct ve_struct *ve, struct nsproxy **old) +{ + int err; + struct task_struct *tsk; + struct nsproxy *cur; + + tsk = current; + cur = tsk->nsproxy; + + err = copy_namespaces(CLONE_NEWNET, tsk, 1); + if (err < 0) + return err; + + put_nsproxy(ve->ve_ns); + ve->ve_ns = get_nsproxy(tsk->nsproxy); + ve->ve_netns = get_net(ve->ve_ns->net_ns); + *old 
= cur; + return 0; +} + +static inline void switch_ve_namespaces(struct ve_struct *ve, + struct task_struct *tsk) +{ + struct nsproxy *old_ns; + struct nsproxy *new_ns; + + BUG_ON(tsk != current); + old_ns = tsk->nsproxy; + new_ns = ve->ve_ns; + + if (old_ns != new_ns) { + tsk->nsproxy = get_nsproxy(new_ns); + put_nsproxy(old_ns); + } +} + +static __u64 get_ve_features(env_create_param_t *data, int datalen) +{ + __u64 known_features; + + if (datalen < sizeof(struct env_create_param3)) + /* this version of vzctl is aware of VE_FEATURES_OLD only */ + known_features = VE_FEATURES_OLD; + else + known_features = data->known_features; + + /* + * known features are set as required + * yet unknown features are set as in VE_FEATURES_DEF + */ + return (data->feature_mask & known_features) | + (VE_FEATURES_DEF & ~known_features); +} + +static int init_ve_struct(struct ve_struct *ve, envid_t veid, + u32 class_id, env_create_param_t *data, int datalen) +{ + (void)get_ve(ve); + ve->veid = veid; + ve->class_id = class_id; + ve->features = get_ve_features(data, datalen); + INIT_LIST_HEAD(&ve->vetask_lh); + init_rwsem(&ve->op_sem); + + ve->start_timespec = current->start_time; + /* The value is wrong, but it is never compared to process + * start times */ + ve->start_jiffies = get_jiffies_64(); + ve->start_cycles = get_cycles(); + + ve->_randomize_va_space = ve0._randomize_va_space; + + return 0; +} + +/********************************************************************** + ********************************************************************** + * + * /proc/meminfo virtualization + * + ********************************************************************** + **********************************************************************/ +static int ve_set_meminfo(envid_t veid, unsigned long val) +{ +#ifdef CONFIG_BEANCOUNTERS + struct ve_struct *ve; + + ve = get_ve_by_id(veid); + if (!ve) + return -EINVAL; + + if (val == 0) + val = VE_MEMINFO_SYSTEM; + else if (val == 1) + val = 
VE_MEMINFO_DEFAULT; + + ve->meminfo_val = val; + real_put_ve(ve); + return 0; +#else + return -ENOTTY; +#endif +} + +static int init_ve_meminfo(struct ve_struct *ve) +{ + ve->meminfo_val = VE_MEMINFO_DEFAULT; + return 0; +} + +static inline void fini_ve_meminfo(struct ve_struct *ve) +{ +} + +static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk) +{ + read_lock(&tsk->fs->lock); + ve->root_path = tsk->fs->root; + read_unlock(&tsk->fs->lock); + mark_tree_virtual(&ve->root_path); +} + +static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk) +{ + /* required for real_setdevperms from register_ve_ above */ + memcpy(&ve->ve_cap_bset, &tsk->cred->cap_effective, sizeof(kernel_cap_t)); +} + +static int ve_list_add(struct ve_struct *ve) +{ + write_lock_irq(&ve_list_lock); + if (__find_ve_by_id(ve->veid) != NULL) + goto err_exists; + + list_add(&ve->ve_list, &ve_list_head); + nr_ve++; + write_unlock_irq(&ve_list_lock); + return 0; + +err_exists: + write_unlock_irq(&ve_list_lock); + return -EEXIST; +} + +static void ve_list_del(struct ve_struct *ve) +{ + write_lock_irq(&ve_list_lock); + list_del(&ve->ve_list); + nr_ve--; + write_unlock_irq(&ve_list_lock); +} + +static void set_task_ve_caps(struct ve_struct *ve, struct cred *new) +{ + const struct cred *cur; + kernel_cap_t bset; + + bset = ve->ve_cap_bset; + cur = current_cred(); + new->cap_effective = cap_intersect(cur->cap_effective, bset); + new->cap_inheritable = cap_intersect(cur->cap_inheritable, bset); + new->cap_permitted = cap_intersect(cur->cap_permitted, bset); + new->cap_bset = cap_intersect(cur->cap_bset, bset); + + if (commit_creds(new)) + /* too late to rollback, but commit currently just works */ + BUG(); +} + +void ve_move_task(struct task_struct *tsk, struct ve_struct *new, struct cred *new_creds) +{ + struct ve_struct *old; + + might_sleep(); + BUG_ON(tsk != current); + BUG_ON(!(thread_group_leader(tsk) && thread_group_empty(tsk))); + + /* this probihibts ptracing of task entered 
to VE from host system */ + if (tsk->mm) + tsk->mm->vps_dumpable = 0; + /* setup capabilities before enter */ + set_task_ve_caps(new, new_creds); + + /* Drop OOM protection. */ + if (tsk->signal->oom_adj == OOM_DISABLE) + tsk->signal->oom_adj = 0; + + old = tsk->ve_task_info.owner_env; + tsk->ve_task_info.owner_env = new; + tsk->ve_task_info.exec_env = new; + + write_lock_irq(&tasklist_lock); + list_del_rcu(&tsk->ve_task_info.vetask_list); + write_unlock_irq(&tasklist_lock); + + synchronize_rcu(); + + write_lock_irq(&tasklist_lock); + list_add_tail_rcu(&tsk->ve_task_info.vetask_list, + &new->vetask_lh); + write_unlock_irq(&tasklist_lock); + + atomic_dec(&old->pcounter); + real_put_ve(old); + + atomic_inc(&new->pcounter); + get_ve(new); + + cgroup_set_task_css(tsk, new->ve_css_set); + + new->user_ns = get_user_ns(new_creds->user->user_ns); +} + +EXPORT_SYMBOL(ve_move_task); + +#ifdef CONFIG_VE_IPTABLES + +static __u64 setup_iptables_mask(__u64 init_mask) +{ + /* Remove when userspace will start supplying IPv6-related bits. 
*/ + init_mask &= ~VE_IP_IPTABLES6; + init_mask &= ~VE_IP_FILTER6; + init_mask &= ~VE_IP_MANGLE6; + init_mask &= ~VE_IP_IPTABLE_NAT_MOD; + init_mask &= ~VE_NF_CONNTRACK_MOD; + + if (mask_ipt_allow(init_mask, VE_IP_IPTABLES)) + init_mask |= VE_IP_IPTABLES6; + if (mask_ipt_allow(init_mask, VE_IP_FILTER)) + init_mask |= VE_IP_FILTER6; + if (mask_ipt_allow(init_mask, VE_IP_MANGLE)) + init_mask |= VE_IP_MANGLE6; + if (mask_ipt_allow(init_mask, VE_IP_NAT)) + init_mask |= VE_IP_IPTABLE_NAT; + if (mask_ipt_allow(init_mask, VE_IP_CONNTRACK)) + init_mask |= VE_NF_CONNTRACK; + + return init_mask; +} + +#endif + +static inline int init_ve_cpustats(struct ve_struct *ve) +{ + ve->cpu_stats = alloc_percpu(struct ve_cpu_stats); + if (ve->cpu_stats == NULL) + return -ENOMEM; + ve->sched_lat_ve.cur = alloc_percpu(struct kstat_lat_pcpu_snap_struct); + if (ve->sched_lat_ve.cur == NULL) + goto fail; /* releases the cpu_stats allocation made above */ + return 0; + +fail: + free_percpu(ve->cpu_stats); + return -ENOMEM; +} + +static inline void free_ve_cpustats(struct ve_struct *ve) +{ + free_percpu(ve->cpu_stats); + ve->cpu_stats = NULL; + free_percpu(ve->sched_lat_ve.cur); + ve->sched_lat_ve.cur = NULL; +} + +static int alone_in_pgrp(struct task_struct *tsk) +{ + struct task_struct *p; + int alone = 0; + + read_lock(&tasklist_lock); + do_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p) { + if (p != tsk) + goto out; + } while_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p); + do_each_pid_task(task_pid(tsk), PIDTYPE_SID, p) { + if (p != tsk) + goto out; + } while_each_pid_task(task_pid(tsk), PIDTYPE_SID, p); + alone = 1; +out: + read_unlock(&tasklist_lock); + return alone; +} + +static int do_env_create(envid_t veid, unsigned int flags, u32 class_id, + env_create_param_t *data, int datalen) +{ + struct task_struct *tsk; + struct cred *new_creds; + struct ve_struct *old; + struct ve_struct *old_exec; + struct ve_struct *ve; + __u64 init_mask; + int err; + struct nsproxy *old_ns, *old_ns_net; + DECLARE_COMPLETION_ONSTACK(sysfs_completion); + + tsk = current; +
old = VE_TASK_INFO(tsk)->owner_env; + + if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) + return -EINVAL; + + if (tsk->signal->tty) { + printk("ERR: CT init has controlling terminal\n"); + return -EINVAL; + } + if (task_pgrp(tsk) != task_pid(tsk) || + task_session(tsk) != task_pid(tsk)) { + int may_setsid; + + read_lock(&tasklist_lock); + may_setsid = !tsk->signal->leader && + !pid_task(find_pid_ns(task_pid_nr(tsk), &init_pid_ns), PIDTYPE_PGID); + read_unlock(&tasklist_lock); + + if (!may_setsid) { + printk("ERR: CT init is process group leader\n"); + return -EINVAL; + } + } + /* Check that the process is not a leader of non-empty group/session. + * If it is, we cannot virtualize its PID and must fail. */ + if (!alone_in_pgrp(tsk)) { + printk("ERR: CT init is not alone in process group\n"); + return -EINVAL; + } + + + VZTRACE("%s: veid=%d classid=%d pid=%d\n", + __FUNCTION__, veid, class_id, current->pid); + + err = -ENOMEM; + ve = kzalloc(sizeof(struct ve_struct), GFP_KERNEL); + if (ve == NULL) + goto err_struct; + + init_ve_struct(ve, veid, class_id, data, datalen); + __module_get(THIS_MODULE); + down_write(&ve->op_sem); + if (flags & VE_LOCK) + ve->is_locked = 1; + + /* + * this should be done before adding to list + * because if calc_load_ve finds this ve in + * list it will be very surprised + */ + if ((err = init_ve_cpustats(ve)) < 0) + goto err_cpu_stats; + + if ((err = ve_list_add(ve)) < 0) + goto err_exist; + + /* this should be done before context switching */ + if ((err = init_printk(ve)) < 0) + goto err_log_wait; + + old_exec = set_exec_env(ve); + + if ((err = init_ve_sched(ve)) < 0) + goto err_sched; + + set_ve_root(ve, tsk); + + if ((err = init_ve_sysfs(ve))) + goto err_sysfs; + + if ((err = init_ve_namespaces(ve, &old_ns))) + goto err_ns; + + if ((err = init_ve_proc(ve))) + goto err_proc; + + + init_mask = data ? 
data->iptables_mask : VE_IP_DEFAULT; + +#ifdef CONFIG_VE_IPTABLES + /* Set up ipt_mask as it will be used during + * net namespace initialization + */ + init_mask = setup_iptables_mask(init_mask); + ve->ipt_mask = init_mask; +#endif + + if ((err = init_ve_netns(ve, &old_ns_net))) + goto err_netns; + + if ((err = init_ve_cgroups(ve))) + goto err_cgroup; + + if ((err = init_ve_tty_drivers(ve)) < 0) + goto err_tty; + + if ((err = init_ve_shmem(ve))) + goto err_shmem; + + if ((err = init_ve_devpts(ve))) + goto err_devpts; + + if((err = init_ve_meminfo(ve))) + goto err_meminf; + + set_ve_caps(ve, tsk); + + if ((err = pid_ns_attach_init(ve->ve_ns->pid_ns, tsk)) < 0) + goto err_vpid; + + new_creds = prepare_creds(); + if (new_creds == NULL) + goto err_creds; + + if ((err = create_user_ns(new_creds)) < 0) + goto err_uns; + + if ((err = ve_hook_iterate_init(VE_SS_CHAIN, ve)) < 0) + goto err_ve_hook; + + put_nsproxy(old_ns); + put_nsproxy(old_ns_net); + + /* finally: set vpids and move inside */ + ve_move_task(tsk, ve, new_creds); + + ve->is_running = 1; + up_write(&ve->op_sem); + + printk(KERN_INFO "CT: %d: started\n", veid); + return veid; + +err_ve_hook: + /* creds will put user and user ns */ +err_uns: + abort_creds(new_creds); +err_creds: + mntget(ve->proc_mnt); +err_vpid: + fini_venet(ve); + fini_ve_meminfo(ve); +err_meminf: + fini_ve_devpts(ve); +err_devpts: + fini_ve_shmem(ve); +err_shmem: + fini_ve_tty_drivers(ve); +err_tty: + fini_ve_cgroups(ve); +err_cgroup: + fini_ve_namespaces(ve, old_ns_net); + put_nsproxy(old_ns_net); + ve->ve_netns->sysfs_completion = &sysfs_completion; + put_net(ve->ve_netns); + wait_for_completion(&sysfs_completion); +err_netns: + /* + * If process hasn't become VE's init, proc_mnt won't be put during + * pidns death, so this mntput by hand is needed. If it has, we + * compensate with mntget above. 
+ */ + mntput(ve->proc_mnt); + fini_ve_proc(ve); +err_proc: + /* free_ve_utsname() is called inside real_put_ve() */ + fini_ve_namespaces(ve, old_ns); + put_nsproxy(old_ns); + /* + * We need to compensate, because fini_ve_namespaces() assumes + * ve->ve_ns will continue to be used after, but VE will be freed soon + * (in kfree() sense). + */ + put_nsproxy(ve->ve_ns); +err_ns: + fini_ve_sysfs(ve); +err_sysfs: + /* It is safe to restore current->envid here because + * ve_fairsched_detach does not use current->envid. */ + /* Really fairsched code uses current->envid in sys_fairsched_mknod + * only. It is correct if sys_fairsched_mknod is called from + * userspace. If sys_fairsched_mknod is called from + * ve_fairsched_attach, then node->envid and node->parent_node->envid + * are explicitly set to valid value after the call. */ + /* FIXME */ + VE_TASK_INFO(tsk)->owner_env = old; + VE_TASK_INFO(tsk)->exec_env = old_exec; + + fini_ve_sched(ve); +err_sched: + (void)set_exec_env(old_exec); + + /* we can jump here having incorrect envid */ + VE_TASK_INFO(tsk)->owner_env = old; + fini_printk(ve); +err_log_wait: + /* cpustats will be freed in do_env_free */ + ve_list_del(ve); + up_write(&ve->op_sem); + + real_put_ve(ve); +err_struct: + printk(KERN_INFO "CT: %d: failed to start with err=%d\n", veid, err); + return err; + +err_exist: + free_ve_cpustats(ve); +err_cpu_stats: + kfree(ve); + module_put(THIS_MODULE); + goto err_struct; +} + + +/********************************************************************** + ********************************************************************** + * + * VE start/stop callbacks + * + ********************************************************************** + **********************************************************************/ + +int real_env_create(envid_t veid, unsigned flags, u32 class_id, + env_create_param_t *data, int datalen) +{ + int status; + struct ve_struct *ve; + + if (!flags) { + status = get_exec_env()->veid; + goto out; + } + + 
status = -EPERM; + if (!capable_setveid()) + goto out; + + status = -EINVAL; + if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE))) + goto out; + + status = -EINVAL; + ve = get_ve_by_id(veid); + if (ve) { + if (flags & VE_TEST) { + status = 0; + goto out_put; + } + if (flags & VE_EXCLUSIVE) { + status = -EACCES; + goto out_put; + } + if (flags & VE_CREATE) { + flags &= ~VE_CREATE; + flags |= VE_ENTER; + } + } else { + if (flags & (VE_TEST|VE_ENTER)) { + status = -ESRCH; + goto out; + } + } + + if (flags & VE_CREATE) { + status = do_env_create(veid, flags, class_id, data, datalen); + goto out; + } else if (flags & VE_ENTER) + status = do_env_enter(ve, flags); + + /* else: returning EINVAL */ + +out_put: + real_put_ve(ve); +out: + return status; +} +EXPORT_SYMBOL(real_env_create); + +static int do_env_enter(struct ve_struct *ve, unsigned int flags) +{ + struct task_struct *tsk = current; + struct cred *new_creds; + int err; + + VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid); + + err = -EBUSY; + down_read(&ve->op_sem); + if (!ve->is_running) + goto out_up; + if (ve->is_locked && !(flags & VE_SKIPLOCK)) + goto out_up; + err = -EINVAL; + if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) + goto out_up; + + new_creds = prepare_creds(); + if (new_creds == NULL) + goto out_up; + +#ifdef CONFIG_VZ_FAIRSCHED + err = sys_fairsched_mvpr(task_pid_vnr(current), ve->veid); + if (err) { + abort_creds(new_creds); + goto out_up; + } +#endif + ve_sched_attach(ve); + switch_ve_namespaces(ve, tsk); + ve_move_task(current, ve, new_creds); + + /* Check that the process is not a leader of non-empty group/session. + * If it is, we cannot virtualize its PID. Do not fail, just leave + * it non-virtual. + */ + if (alone_in_pgrp(tsk) && !(flags & VE_SKIPLOCK)) + pid_ns_attach_task(ve->ve_ns->pid_ns, tsk); + + /* Unlike VE_CREATE, we do not setsid() in VE_ENTER. + * Process is allowed to be in an external group/session. 
+ * If user space callers wants, it will do setsid() after + * VE_ENTER. + */ + err = VE_TASK_INFO(tsk)->owner_env->veid; + tsk->did_ve_enter = 1; + +out_up: + up_read(&ve->op_sem); + return err; +} + +static void env_cleanup(struct ve_struct *ve) +{ + struct ve_struct *old_ve; + DECLARE_COMPLETION_ONSTACK(sysfs_completion); + + VZTRACE("real_do_env_cleanup\n"); + + down_read(&ve->op_sem); + old_ve = set_exec_env(ve); + + ve_hook_iterate_fini(VE_SS_CHAIN, ve); + + fini_venet(ve); + + /* no new packets in flight beyond this point */ + + fini_ve_sched(ve); + + fini_ve_devpts(ve); + fini_ve_shmem(ve); + unregister_ve_tty_drivers(ve); + fini_ve_meminfo(ve); + + fini_ve_cgroups(ve); + + fini_ve_namespaces(ve, NULL); + ve->ve_netns->sysfs_completion = &sysfs_completion; + put_net(ve->ve_netns); + wait_for_completion(&sysfs_completion); + fini_ve_proc(ve); + fini_ve_sysfs(ve); + + (void)set_exec_env(old_ve); + fini_printk(ve); /* no printk can happen in ve context anymore */ + + ve_list_del(ve); + up_read(&ve->op_sem); + + real_put_ve(ve); +} + +static DECLARE_COMPLETION(vzmond_complete); +static int vzmond_helper(void *arg) +{ + char name[18]; + struct ve_struct *ve; + + ve = (struct ve_struct *)arg; + snprintf(name, sizeof(name), "vzmond/%d", ve->veid); + daemonize(name); + env_cleanup(ve); + module_put_and_exit(0); +} + +static void do_pending_env_cleanups(void) +{ + int err; + struct ve_struct *ve; + + spin_lock(&ve_cleanup_lock); + while (1) { + if (list_empty(&ve_cleanup_list) || need_resched()) + break; + + ve = list_first_entry(&ve_cleanup_list, + struct ve_struct, cleanup_list); + list_del(&ve->cleanup_list); + spin_unlock(&ve_cleanup_lock); + + __module_get(THIS_MODULE); + err = kernel_thread(vzmond_helper, (void *)ve, 0); + if (err < 0) { + env_cleanup(ve); + module_put(THIS_MODULE); + } + + spin_lock(&ve_cleanup_lock); + } + spin_unlock(&ve_cleanup_lock); +} + +static inline int have_pending_cleanups(void) +{ + return !list_empty(&ve_cleanup_list); +} + 
+static int vzmond(void *arg) +{ + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop() || have_pending_cleanups()) { + schedule(); + try_to_freeze(); + if (signal_pending(current)) + flush_signals(current); + + do_pending_env_cleanups(); + set_current_state(TASK_INTERRUPTIBLE); + if (have_pending_cleanups()) + __set_current_state(TASK_RUNNING); + } + + __set_task_state(current, TASK_RUNNING); + complete_and_exit(&vzmond_complete, 0); +} + +static int __init init_vzmond(void) +{ + ve_cleanup_thread = kthread_run(vzmond, NULL, "vzmond"); + if (IS_ERR(ve_cleanup_thread)) + return PTR_ERR(ve_cleanup_thread); + else + return 0; +} + +static void fini_vzmond(void) +{ + kthread_stop(ve_cleanup_thread); + WARN_ON(!list_empty(&ve_cleanup_list)); +} + +static void real_do_env_free(struct ve_struct *ve) +{ + VZTRACE("real_do_env_free\n"); + + free_ve_tty_drivers(ve); + free_ve_filesystems(ve); + free_ve_cpustats(ve); + printk(KERN_INFO "CT: %d: stopped\n", VEID(ve)); + kfree(ve); + + module_put(THIS_MODULE); +} + +/********************************************************************** + ********************************************************************** + * + * VE TTY handling + * + ********************************************************************** + **********************************************************************/ + +static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base, + struct ve_struct *ve) +{ + size_t size; + struct tty_driver *driver; + + /* FIXME: make it a normal way (or wait till ms version) */ + + driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL_UBC); + if (!driver) + goto out; + + memcpy(driver, base, sizeof(struct tty_driver)); + + driver->driver_state = NULL; + + size = base->num * 3 * sizeof(void *); + if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { + void **p; + p = kzalloc(size, GFP_KERNEL_UBC); + if (!p) + goto out_free; + + driver->ttys = (struct tty_struct **)p; + driver->termios = (struct ktermios 
/*
 * Release every per-VE tty driver copy together with the pty index
 * allocator, and reset the corresponding ve fields to NULL so that the
 * function may be called again on the same ve.
 */
static void free_ve_tty_drivers(struct ve_struct* ve)
{
#ifdef CONFIG_LEGACY_PTYS
	/* BSD-style master/slave pair */
	free_ve_tty_driver(ve->pty_driver);
	ve->pty_driver = NULL;
	free_ve_tty_driver(ve->pty_slave_driver);
	ve->pty_slave_driver = NULL;
#endif
#ifdef CONFIG_UNIX98_PTYS
	/* Unix98 ptm/pts pair */
	free_ve_tty_driver(ve->ptm_driver);
	ve->ptm_driver = NULL;
	free_ve_tty_driver(ve->pts_driver);
	ve->pts_driver = NULL;

	/* the ida exists only if alloc_ve_tty_drivers() got that far */
	if (ve->allocated_ptys != NULL) {
		ida_destroy(ve->allocated_ptys);
		kfree(ve->allocated_ptys);
		ve->allocated_ptys = NULL;
	}
#endif
}
/*
 * Allocate the per-VE tty drivers and put them on the global driver list.
 *
 * Returns 0 on success.  If allocation fails its error is returned as is;
 * if registration fails the freshly allocated drivers are freed again
 * before the error is returned.
 */
static int init_ve_tty_drivers(struct ve_struct *ve)
{
	int err;

	err = alloc_ve_tty_drivers(ve);
	if (err)
		return err;

	err = register_ve_tty_drivers(ve);
	if (err) {
		free_ve_tty_drivers(ve);
		return err;
	}

	return 0;
}
+ */ +static void clear_termios(struct tty_driver *driver) +{ + int i; + struct ktermios *tp; + + if (driver->termios == NULL) + return; + for (i = 0; i < driver->num; i++) { + tp = driver->termios[i]; + if (tp) { + driver->termios[i] = NULL; + kfree(tp); + } + tp = driver->termios_locked[i]; + if (tp) { + driver->termios_locked[i] = NULL; + kfree(tp); + } + } +} + + +/********************************************************************** + ********************************************************************** + * + * Pieces of VE network + * + ********************************************************************** + **********************************************************************/ + +#ifdef CONFIG_NET +#include +#include +#include +#include +#include +#include +#endif + +static int ve_dev_add(envid_t veid, char *dev_name) +{ + struct net_device *dev; + struct ve_struct *dst_ve; + struct net *dst_net; + int err = -ESRCH; + + dst_ve = get_ve_by_id(veid); + if (dst_ve == NULL) + goto out; + + dst_net = dst_ve->ve_netns; + + rtnl_lock(); + read_lock(&dev_base_lock); + dev = __dev_get_by_name(&init_net, dev_name); + read_unlock(&dev_base_lock); + if (dev == NULL) + goto out_unlock; + + err = __dev_change_net_namespace(dev, dst_net, dev_name, get_exec_ub()); +out_unlock: + rtnl_unlock(); + real_put_ve(dst_ve); + + if (dev == NULL) + printk(KERN_WARNING "%s: device %s not found\n", + __func__, dev_name); +out: + return err; +} + +static int ve_dev_del(envid_t veid, char *dev_name) +{ + struct net_device *dev; + struct ve_struct *src_ve; + struct net *src_net; + int err = -ESRCH; + + src_ve = get_ve_by_id(veid); + if (src_ve == NULL) + goto out; + + src_net = src_ve->ve_netns; + + rtnl_lock(); + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(src_net, dev_name); + read_unlock(&dev_base_lock); + if (dev == NULL) + goto out_unlock; + + err = __dev_change_net_namespace(dev, &init_net, dev_name, + netdev_bc(dev)->owner_ub); +out_unlock: + rtnl_unlock(); + 
real_put_ve(src_ve); + + if (dev == NULL) + printk(KERN_WARNING "%s: device %s not found\n", + __func__, dev_name); +out: + return err; +} + +int real_ve_dev_map(envid_t veid, int op, char *dev_name) +{ + if (!capable_setveid()) + return -EPERM; + switch (op) { + case VE_NETDEV_ADD: + return ve_dev_add(veid, dev_name); + case VE_NETDEV_DEL: + return ve_dev_del(veid, dev_name); + default: + return -EINVAL; + } +} + +/********************************************************************** + ********************************************************************** + * + * VE information via /proc + * + ********************************************************************** + **********************************************************************/ +#ifdef CONFIG_PROC_FS +#if BITS_PER_LONG == 32 +#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21) +#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n" +#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n" +#else +#define VESTAT_LINE_WIDTH (12 * 21) +#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n" +#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n" +#endif + +static int vestat_seq_show(struct seq_file *m, void *v) +{ + struct list_head *entry; + struct ve_struct *ve; + struct ve_struct *curve; + int cpu; + unsigned long user_ve, nice_ve, system_ve; + unsigned long long uptime; + cycles_t uptime_cycles, idle_time, strv_time, used; + + entry = (struct list_head *)v; + ve = list_entry(entry, struct ve_struct, ve_list); + + curve = get_exec_env(); + if (entry == ve_list_head.next || + (!ve_is_super(curve) && ve == curve)) { + /* print header */ + seq_printf(m, "%-*s\n", + VESTAT_LINE_WIDTH - 1, + "Version: 2.2"); + seq_printf(m, VESTAT_HEAD_FMT, "VEID", + "user", "nice", "system", + "uptime", "idle", + "strv", "uptime", "used", + "maxlat", "totlat", "numsched"); + } + + if 
(ve == get_ve0()) + return 0; + + user_ve = nice_ve = system_ve = 0; + idle_time = strv_time = used = 0; + + for_each_online_cpu(cpu) { + struct ve_cpu_stats *st; + + st = VE_CPU_STATS(ve, cpu); + user_ve += st->user; + nice_ve += st->nice; + system_ve += st->system; + used += st->used_time; + idle_time += ve_sched_get_idle_time(ve, cpu); + } + uptime_cycles = get_cycles() - ve->start_cycles; + uptime = get_jiffies_64() - ve->start_jiffies; + + seq_printf(m, VESTAT_LINE_FMT, ve->veid, + user_ve, nice_ve, system_ve, + (unsigned long long)uptime, + (unsigned long long)idle_time, + (unsigned long long)strv_time, + (unsigned long long)uptime_cycles, + (unsigned long long)used, + (unsigned long long)ve->sched_lat_ve.last.maxlat, + (unsigned long long)ve->sched_lat_ve.last.totlat, + ve->sched_lat_ve.last.count); + return 0; +} + +void *ve_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ve_struct *curve; + + curve = get_exec_env(); + read_lock(&ve_list_lock); + if (!ve_is_super(curve)) { + if (*pos != 0) + return NULL; + return curve; + } + + return seq_list_start(&ve_list_head, *pos); +} +EXPORT_SYMBOL(ve_seq_start); + +void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (!ve_is_super(get_exec_env())) + return NULL; + else + return seq_list_next(v, &ve_list_head, pos); +} +EXPORT_SYMBOL(ve_seq_next); + +void ve_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_list_lock); +} +EXPORT_SYMBOL(ve_seq_stop); + +static struct seq_operations vestat_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + .show = vestat_seq_show +}; + +static int vestat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vestat_seq_op); +} + +static struct file_operations proc_vestat_operations = { + .open = vestat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static struct seq_operations devperms_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + 
.show = devperms_seq_show, +}; + +static int devperms_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &devperms_seq_op); +} + +static struct file_operations proc_devperms_ops = { + .open = devperms_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int vz_version_show(struct seq_file *file, void* v) +{ + static const char ver[] = VZVERSION "\n"; + + return seq_puts(file, ver); +} + +static int vz_version_open(struct inode *inode, struct file *file) +{ + return single_open(file, vz_version_show, NULL); +} + +static struct file_operations proc_vz_version_oparations = { + .open = vz_version_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static inline unsigned long ve_used_mem(struct user_beancounter *ub) +{ + extern int glob_ve_meminfo; + return glob_ve_meminfo ? ub->ub_parms[UB_OOMGUARPAGES].held : + ub->ub_parms[UB_PRIVVMPAGES].held ; +} + +static void ve_swapinfo(struct sysinfo *val, struct user_beancounter *ub) +{ + unsigned long size, used; + + size = ub->ub_parms[UB_SWAPPAGES].limit; + used = ub->ub_parms[UB_SWAPPAGES].held; + + if (size == UB_MAXVALUE) + size = 0; + + val->totalswap = size; + val->freeswap = size > used ? size - used : 0; +} + +static inline int ve_mi_replace(struct meminfo *mi, int old_ret) +{ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; + unsigned long meminfo_val; + unsigned long nodettram; + unsigned long usedmem; + + meminfo_val = get_exec_env()->meminfo_val; + if (meminfo_val == VE_MEMINFO_DEFAULT) + return old_ret; /* Default behaviour */ + + if (meminfo_val == VE_MEMINFO_SYSTEM) + return NOTIFY_DONE | NOTIFY_STOP_MASK; /* No virtualization */ + + nodettram = mi->si.totalram; + ub = top_beancounter(current->mm->mm_ub); + usedmem = ve_used_mem(ub); + + memset(mi, 0, sizeof(*mi)); + + mi->si.totalram = (meminfo_val > nodettram) ? + nodettram : meminfo_val; + mi->si.freeram = (mi->si.totalram > usedmem) ? 
+ (mi->si.totalram - usedmem) : 0; + + ve_swapinfo(&mi->si, ub); + + return NOTIFY_OK | NOTIFY_STOP_MASK; +#else + return NOTIFY_DONE; +#endif +} + +static int meminfo_call(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + if (event != VIRTINFO_MEMINFO) + return old_ret; + + return ve_mi_replace((struct meminfo *)arg, old_ret); +} + + +static struct vnotifier_block meminfo_notifier_block = { + .notifier_call = meminfo_call +}; + +/* /proc/vz/veinfo */ + +static ve_seq_print_t veaddr_seq_print_cb; + +void vzmon_register_veaddr_print_cb(ve_seq_print_t cb) +{ + rcu_assign_pointer(veaddr_seq_print_cb, cb); +} +EXPORT_SYMBOL(vzmon_register_veaddr_print_cb); + +void vzmon_unregister_veaddr_print_cb(ve_seq_print_t cb) +{ + rcu_assign_pointer(veaddr_seq_print_cb, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL(vzmon_unregister_veaddr_print_cb); + +static int veinfo_seq_show(struct seq_file *m, void *v) +{ + struct ve_struct *ve; + ve_seq_print_t veaddr_seq_print; + + ve = list_entry((struct list_head *)v, struct ve_struct, ve_list); + + seq_printf(m, "%10u %5u %5u", ve->veid, + ve->class_id, atomic_read(&ve->pcounter)); + + rcu_read_lock(); + veaddr_seq_print = rcu_dereference(veaddr_seq_print_cb); + if (veaddr_seq_print) + veaddr_seq_print(m, ve); + rcu_read_unlock(); + + seq_putc(m, '\n'); + return 0; +} + +static struct seq_operations veinfo_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + .show = veinfo_seq_show, +}; + +static int veinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &veinfo_seq_op); +} + +static struct file_operations proc_veinfo_operations = { + .open = veinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init init_vecalls_proc(void) +{ + struct proc_dir_entry *de; + + de = proc_create("vestat", S_IFREG | S_IRUSR, proc_vz_dir, + &proc_vestat_operations); + if (!de) + printk(KERN_WARNING "VZMON: can't make vestat 
proc entry\n"); + + de = proc_create("devperms", S_IFREG | S_IRUSR, proc_vz_dir, + &proc_devperms_ops); + if (!de) + printk(KERN_WARNING "VZMON: can't make devperms proc entry\n"); + + de = proc_create("version", S_IFREG | S_IRUGO, proc_vz_dir, + &proc_vz_version_oparations); + if (!de) + printk(KERN_WARNING "VZMON: can't make version proc entry\n"); + + de = proc_create("veinfo", S_IFREG | S_IRUSR, proc_vz_dir, + &proc_veinfo_operations); + if (!de) + printk(KERN_WARNING "VZMON: can't make veinfo proc entry\n"); + + virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block); + return 0; +} + +static void fini_vecalls_proc(void) +{ + remove_proc_entry("version", proc_vz_dir); + remove_proc_entry("devperms", proc_vz_dir); + remove_proc_entry("vestat", proc_vz_dir); + remove_proc_entry("veinfo", proc_vz_dir); + virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block); +} +#else +#define init_vecalls_proc() (0) +#define fini_vecalls_proc() do { } while (0) +#endif /* CONFIG_PROC_FS */ + + +/********************************************************************** + ********************************************************************** + * + * User ctl + * + ********************************************************************** + **********************************************************************/ + +int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VZCTL_MARK_ENV_TO_DOWN: { + /* Compatibility issue */ + err = 0; + } + break; + case VZCTL_SETDEVPERMS: { + /* Device type was mistakenly declared as dev_t + * in the old user-kernel interface. + * That's wrong, dev_t is a kernel internal type. + * I use `unsigned' not having anything better in mind. 
+ * 2001/08/11 SAW */ + struct vzctl_setdevperms s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_setdevperms(s.veid, s.type, + new_decode_dev(s.dev), s.mask); + } + break; +#ifdef CONFIG_INET + case VZCTL_VE_NETDEV: { + struct vzctl_ve_netdev d; + char *s; + err = -EFAULT; + if (copy_from_user(&d, (void __user *)arg, sizeof(d))) + break; + err = -ENOMEM; + s = kmalloc(IFNAMSIZ+1, GFP_KERNEL); + if (s == NULL) + break; + err = -EFAULT; + if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) { + s[IFNAMSIZ] = 0; + err = real_ve_dev_map(d.veid, d.op, s); + } + kfree(s); + } + break; +#endif + case VZCTL_ENV_CREATE: { + struct vzctl_env_create s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_env_create(s.veid, s.flags, s.class_id, + NULL, 0); + } + break; + case VZCTL_ENV_CREATE_DATA: { + struct vzctl_env_create_data s; + env_create_param_t *data; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err=-EINVAL; + if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN || + s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN || + s.data == 0) + break; + err = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + break; + + err = -EFAULT; + if (copy_from_user(data, (void __user *)s.data, + s.datalen)) + goto free_data; + err = real_env_create(s.veid, s.flags, s.class_id, + data, s.datalen); +free_data: + kfree(data); + } + break; + case VZCTL_GET_CPU_STAT: { + struct vzctl_cpustatctl s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = ve_get_cpu_stat(s.veid, s.cpustat); + } + break; + case VZCTL_VE_MEMINFO: { + struct vzctl_ve_meminfo s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = ve_set_meminfo(s.veid, s.val); + } + break; + } + return err; +} + +#ifdef CONFIG_COMPAT +int compat_vzcalls_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int 
err; + + switch(cmd) { + case VZCTL_GET_CPU_STAT: { + /* FIXME */ + } + case VZCTL_COMPAT_ENV_CREATE_DATA: { + struct compat_vzctl_env_create_data cs; + struct vzctl_env_create_data __user *s; + + s = compat_alloc_user_space(sizeof(*s)); + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + if (put_user(cs.veid, &s->veid) || + put_user(cs.flags, &s->flags) || + put_user(cs.class_id, &s->class_id) || + put_user(compat_ptr(cs.data), &s->data) || + put_user(cs.datalen, &s->datalen)) + break; + err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA, + (unsigned long)s); + break; + } +#ifdef CONFIG_NET + case VZCTL_COMPAT_VE_NETDEV: { + struct compat_vzctl_ve_netdev cs; + struct vzctl_ve_netdev __user *s; + + s = compat_alloc_user_space(sizeof(*s)); + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + if (put_user(cs.veid, &s->veid) || + put_user(cs.op, &s->op) || + put_user(compat_ptr(cs.dev_name), &s->dev_name)) + break; + err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s); + break; + } +#endif + case VZCTL_COMPAT_VE_MEMINFO: { + struct compat_vzctl_ve_meminfo cs; + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + err = ve_set_meminfo(cs.veid, cs.val); + break; + } + default: + err = vzcalls_ioctl(file, cmd, arg); + break; + } + return err; +} +#endif + +static struct vzioctlinfo vzcalls = { + .type = VZCTLTYPE, + .ioctl = vzcalls_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_vzcalls_ioctl, +#endif + .owner = THIS_MODULE, +}; + + +/********************************************************************** + ********************************************************************** + * + * Init/exit stuff + * + ********************************************************************** + **********************************************************************/ + +static inline __init int init_vecalls_ioctls(void) +{ + vzioctl_register(&vzcalls); + return 0; +} + +static inline void 
fini_vecalls_ioctls(void) +{ + vzioctl_unregister(&vzcalls); +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *table_header; + +static ctl_table kernel_table[] = { + { + .procname = "ve_allow_kthreads", + .data = &ve_allow_kthreads, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { 0 } +}; + +static ctl_table root_table[] = { + {CTL_KERN, "kernel", NULL, 0, 0555, kernel_table}, + { 0 } +}; + +static int init_vecalls_sysctl(void) +{ + table_header = register_sysctl_table(root_table); + if (!table_header) + return -ENOMEM ; + return 0; +} + +static void fini_vecalls_sysctl(void) +{ + unregister_sysctl_table(table_header); +} +#else +static int init_vecalls_sysctl(void) { return 0; } +static void fini_vecalls_sysctl(void) { ; } +#endif + +static int __init vecalls_init(void) +{ + int err; + + err = init_vecalls_sysctl(); + if (err) + goto out_vzmond; + + err = init_vzmond(); + if (err < 0) + goto out_sysctl; + + err = init_vecalls_proc(); + if (err < 0) + goto out_proc; + + err = init_vecalls_ioctls(); + if (err < 0) + goto out_ioctls; + + /* We can easy dereference this hook if VE is running + * because in this case vzmon refcount > 0 + */ + do_ve_enter_hook = do_env_enter; + /* + * This one can also be dereferenced since not freed + * VE holds reference on module + */ + do_env_free_hook = real_do_env_free; + + return 0; + +out_ioctls: + fini_vecalls_proc(); +out_proc: + fini_vzmond(); +out_sysctl: + fini_vecalls_sysctl(); +out_vzmond: + return err; +} + +static void vecalls_exit(void) +{ + do_env_free_hook = NULL; + do_ve_enter_hook = NULL; + fini_vecalls_ioctls(); + fini_vecalls_proc(); + fini_vzmond(); + fini_vecalls_sysctl(); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Control"); +MODULE_LICENSE("GPL v2"); + +module_init(vecalls_init) +module_exit(vecalls_exit) diff -urNp linux-2.6.32.48/kernel/ve/veowner.c linux-2.6.32.48-openvz/kernel/ve/veowner.c --- linux-2.6.32.48/kernel/ve/veowner.c 1969-12-31 
19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/ve/veowner.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,160 @@ +/* + * kernel/ve/veowner.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +void prepare_ve0_process(struct task_struct *tsk) +{ + VE_TASK_INFO(tsk)->exec_env = get_ve0(); + VE_TASK_INFO(tsk)->owner_env = get_ve0(); + VE_TASK_INFO(tsk)->sleep_time = 0; + VE_TASK_INFO(tsk)->wakeup_stamp = 0; + VE_TASK_INFO(tsk)->sched_time = 0; + seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock); + + if (tsk->pid) { + list_add_rcu(&tsk->ve_task_info.vetask_list, + &get_ve0()->vetask_lh); + atomic_inc(&get_ve0()->pcounter); + } +} + +/* + * ------------------------------------------------------------------------ + * proc entries + * ------------------------------------------------------------------------ + */ + +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_vz_dir; +EXPORT_SYMBOL(proc_vz_dir); + +struct proc_dir_entry *glob_proc_vz_dir; +EXPORT_SYMBOL(glob_proc_vz_dir); + +static void prepare_proc(void) +{ + proc_vz_dir = proc_mkdir("vz", NULL); + if (!proc_vz_dir) + panic("Can't create /proc/vz dir\n"); + + glob_proc_vz_dir = proc_mkdir("vz", &glob_proc_root); + if (!proc_vz_dir) + panic("Can't create /proc/vz dir\n"); +} +#endif + +/* + * ------------------------------------------------------------------------ + * OpenVZ sysctl + * ------------------------------------------------------------------------ + */ +int ve_xattr_policy = VE_XATTR_POLICY_ACCEPT; +extern int ve_area_access_check; + +#ifdef CONFIG_INET +static struct ctl_table vz_ipv4_route_table[] = { + { + .procname = "src_check", + .data = &ip_rt_src_check, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = proc_dointvec, + }, + { 0 } +}; + +static struct ctl_path net_ipv4_route_path[] = { + { .ctl_name = CTL_NET, .procname = "net", }, + { .ctl_name = NET_IPV4, .procname = "ipv4", }, + { .ctl_name = NET_IPV4_ROUTE, .procname = "route", }, + { } +}; +#endif + +static struct ctl_table vz_fs_table[] = { + { + .procname = "ve-area-access-check", + .data = &ve_area_access_check, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ve-xattr-policy", + .data = &ve_xattr_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { 0 } +}; + +static struct ctl_path fs_path[] = { + { .ctl_name = CTL_FS, .procname = "fs", }, + { } +}; + +static void prepare_sysctl(void) +{ +#ifdef CONFIG_INET + register_sysctl_paths(net_ipv4_route_path, vz_ipv4_route_table); +#endif + register_sysctl_paths(fs_path, vz_fs_table); +} + +/* + * ------------------------------------------------------------------------ + * XXX init_ve_system + * ------------------------------------------------------------------------ + */ + +void init_ve_system(void) +{ + struct task_struct *init_entry; + struct ve_struct *ve; + + ve = get_ve0(); + + init_entry = init_pid_ns.child_reaper; + /* if ve_move_task to VE0 (e.g. in cpt code) * + * occurs, ve_cap_bset on VE0 is required */ + ve->ve_cap_bset = CAP_INIT_EFF_SET; + + read_lock(&init_entry->fs->lock); + ve->root_path = init_entry->fs->root; + read_unlock(&init_entry->fs->lock); + +#ifdef CONFIG_PROC_FS + prepare_proc(); +#endif + prepare_sysctl(); +} diff -urNp linux-2.6.32.48/kernel/ve/vzdev.c linux-2.6.32.48-openvz/kernel/ve/vzdev.c --- linux-2.6.32.48/kernel/ve/vzdev.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/ve/vzdev.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,154 @@ +/* + * kernel/ve/vzdev.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. 
+ * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VZCTL_MAJOR 126 +#define VZCTL_NAME "vzctl" + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Interface"); +MODULE_LICENSE("GPL v2"); + +static LIST_HEAD(ioctls); +static DEFINE_SPINLOCK(ioctl_lock); /* protects the ioctls list */ + +static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd) +{ + struct vzioctlinfo *h; + + spin_lock(&ioctl_lock); + list_for_each_entry(h, &ioctls, list) { + if (h->type == _IOC_TYPE(cmd)) + goto found; + } + h = NULL; +found: + if (h && !try_module_get(h->owner)) + h = NULL; + spin_unlock(&ioctl_lock); + return h; +} + +static void vzctl_put_handler(struct vzioctlinfo *h) +{ + if (!h) + return; + + module_put(h->owner); +} + +long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vzioctlinfo *h; + int err; + + err = -ENOTTY; + h = vzctl_get_handler(cmd); + if (h && h->ioctl) + err = (*h->ioctl)(file, cmd, arg); + vzctl_put_handler(h); + + return err; +} + +long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vzioctlinfo *h; + int err; + + err = -ENOIOCTLCMD; + h = vzctl_get_handler(cmd); + if (h && h->compat_ioctl) + err = (*h->compat_ioctl)(file, cmd, arg); + vzctl_put_handler(h); + + return err; +} + +void vzioctl_register(struct vzioctlinfo *inf) +{ + spin_lock(&ioctl_lock); + list_add(&inf->list, &ioctls); + spin_unlock(&ioctl_lock); +} +EXPORT_SYMBOL(vzioctl_register); + +void vzioctl_unregister(struct vzioctlinfo *inf) +{ + spin_lock(&ioctl_lock); + list_del_init(&inf->list); + spin_unlock(&ioctl_lock); +} +EXPORT_SYMBOL(vzioctl_unregister); + +/* + * Init/exit stuff.
+ */ +static struct file_operations vzctl_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = vzctl_ioctl, + .compat_ioctl = compat_vzctl_ioctl, +}; + +static struct class *vzctl_class; + +static void __exit vzctl_exit(void) +{ + device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0)); + class_destroy(vzctl_class); + unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); +} + +static int __init vzctl_init(void) +{ + int ret; + struct device *class_err; + + ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops); + if (ret < 0) + goto out; + + vzctl_class = class_create(THIS_MODULE, "vzctl"); + if (IS_ERR(vzctl_class)) { + ret = PTR_ERR(vzctl_class); + goto out_cleandev; + } + + class_err = device_create(vzctl_class, NULL, + MKDEV(VZCTL_MAJOR, 0), NULL, VZCTL_NAME); + if (IS_ERR(class_err)) { + ret = PTR_ERR(class_err); + goto out_rmclass; + } + + goto out; + +out_rmclass: + class_destroy(vzctl_class); +out_cleandev: + unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); +out: + return ret; +} + +module_init(vzctl_init) +module_exit(vzctl_exit); diff -urNp linux-2.6.32.48/kernel/ve/vzevent.c linux-2.6.32.48-openvz/kernel/ve/vzevent.c --- linux-2.6.32.48/kernel/ve/vzevent.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/ve/vzevent.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,139 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define NETLINK_UEVENT 31 +#define VZ_EVGRP_ALL 0x01 + +static int reboot_event; +module_param(reboot_event, int, 0644); +MODULE_PARM_DESC(reboot_event, "Enable reboot events"); + +/* + * NOTE: the original idea was to send events via kobject_uevent(), + * however, it turns out that it has negative consequences like + * start of /sbin/hotplug which tries to react on our events in inadequate manner. 
+ */ + +static struct sock *vzev_sock; + +static char *action_to_string(int action) +{ + switch (action) { + case VE_EVENT_MOUNT: + return "ve-mount"; + case VE_EVENT_UMOUNT: + return "ve-umount"; + case VE_EVENT_START: + return "ve-start"; + case VE_EVENT_STOP: + return "ve-stop"; + case VE_EVENT_REBOOT: + return "ve-reboot"; + default: + return NULL; + } +} + +static int do_vzevent_send(int event, char *msg, int len) +{ + struct sk_buff *skb; + char *buf, *action; + int alen; + + action = action_to_string(event); + if (!action) + return -EINVAL; + + alen = strlen(action); + + skb = alloc_skb(len + 1 + alen, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + buf = skb_put(skb, len + 1 + alen); + memcpy(buf, action, alen); + buf[alen] = '@'; + memcpy(buf + alen + 1, msg, len); + (void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL); + return 0; +} + +int vzevent_send(int event, const char *attrs_fmt, ...) +{ + va_list args; + int len, err; + struct ve_struct *ve; + char *page; + + err = -ENOMEM; + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + goto out; + + va_start(args, attrs_fmt); + len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args); + va_end(args); + + ve = set_exec_env(get_ve0()); + err = do_vzevent_send(event, page, len); + (void)set_exec_env(ve); + free_page((unsigned long)page); +out: + return err; +} +EXPORT_SYMBOL(vzevent_send); + +static int ve_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + vzevent_send(VE_EVENT_START, "%d", ve->veid); + return 0; +} + +static void ve_stop(void *data) +{ + struct ve_struct *ve; + int event = VE_EVENT_STOP; + + if (test_and_clear_bit(VE_REBOOT, &get_exec_env()->flags) && + reboot_event) + event = VE_EVENT_REBOOT; + + ve = (struct ve_struct *)data; + vzevent_send(event, "%d", ve->veid); +} + +static struct ve_hook ve_start_stop_hook = { + .init = ve_start, + .fini = ve_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_AFTERALL, +}; + +static int __init 
init_vzevent(void) +{ + vzev_sock = netlink_kernel_create(&init_net, NETLINK_UEVENT, 0, NULL, NULL, THIS_MODULE); + if (vzev_sock == NULL) + return -ENOMEM; + ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook); + return 0; +} + +static void __exit exit_vzevent(void) +{ + ve_hook_unregister(&ve_start_stop_hook); + netlink_kernel_release(vzev_sock); /* pairs with netlink_kernel_create() */ +} + +MODULE_LICENSE("GPL"); + +module_init(init_vzevent); +module_exit(exit_vzevent); diff -urNp linux-2.6.32.48/kernel/ve/vzwdog.c linux-2.6.32.48-openvz/kernel/ve/vzwdog.c --- linux-2.6.32.48/kernel/ve/vzwdog.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.32.48-openvz/kernel/ve/vzwdog.c 2011-11-21 17:40:47.000000000 -0500 @@ -0,0 +1,322 @@ +/* + * kernel/ve/vzwdog.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Stuff regarding kernel thread polling VE validity */ +static int sleep_timeout = 60; +static struct task_struct *wdog_thread_tsk; + +extern void show_mem(void); + +static struct file *intr_file; +static char page[PAGE_SIZE]; + +static void parse_irq_list(int len) +{ + int i, k, skip; + for (i = 0; i < len; ) { + k = i; + while (i < len && page[i] != '\n' && page[i] != ':') + i++; + skip = 0; + if (i < len && page[i] != '\n') { + i++; /* skip ':' */ + while (i < len && (page[i] == ' ' || page[i] == '0')) + i++; + skip = (i < len && (page[i] < '0' || page[i] > '9')); + while (i < len && page[i] != '\n') + i++; + } + if (!skip) + printk("%.*s\n", i - k, page + k); + if (i < len) + i++; /* skip '\n' */ + } +} + +extern loff_t vfs_llseek(struct file *file, loff_t, int); +extern ssize_t vfs_read(struct file *file, char __user *, size_t, loff_t *); +extern struct file *filp_open(const char *filename, int flags, int mode); +extern int
filp_close(struct file *filp, fl_owner_t id); +static void show_irq_list(void) +{ + mm_segment_t fs; + int r; + + fs = get_fs(); + set_fs(KERNEL_DS); + vfs_llseek(intr_file, 0, 0); + r = vfs_read(intr_file, (void __user *)page, sizeof(page), + &intr_file->f_pos); + set_fs(fs); + + if (r > 0) + parse_irq_list(r); +} + +static void show_alloc_latency(void) +{ + static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = { + "A0", + "L0", + "H0", + "L1", + "H1" + }; + int i; + + printk("lat: "); + for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) { + struct kstat_lat_struct *p; + cycles_t maxlat, avg0, avg1, avg2; + + p = &kstat_glob.alloc_lat[i]; + spin_lock_irq(&kstat_glb_lock); + maxlat = p->last.maxlat; + avg0 = p->avg[0]; + avg1 = p->avg[1]; + avg2 = p->avg[2]; + spin_unlock_irq(&kstat_glb_lock); + + printk("%s %Lu (%Lu %Lu %Lu)", + alloc_descr[i], + (unsigned long long)maxlat, + (unsigned long long)avg0, + (unsigned long long)avg1, + (unsigned long long)avg2); + } + printk("\n"); +} + +static void show_schedule_latency(void) +{ + struct kstat_lat_pcpu_struct *p; + cycles_t maxlat, totlat, avg0, avg1, avg2; + unsigned long count; + + p = &kstat_glob.sched_lat; + spin_lock_irq(&kstat_glb_lock); + maxlat = p->last.maxlat; + totlat = p->last.totlat; + count = p->last.count; + avg0 = p->avg[0]; + avg1 = p->avg[1]; + avg2 = p->avg[2]; + spin_unlock_irq(&kstat_glb_lock); + + printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n", + (unsigned long long)maxlat, + (unsigned long long)totlat, + count, + (unsigned long long)avg0, + (unsigned long long)avg1, + (unsigned long long)avg2); +} + +static void show_header(void) +{ + struct timeval tv; + + do_gettimeofday(&tv); + preempt_disable(); + printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n", + tv.tv_sec, (long)tv.tv_usec, + (unsigned long long)get_jiffies_64(), + smp_processor_id()); +#ifdef CONFIG_FAIRSCHED + printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n", + cycles_per_jiffy, HZ); +#else + printk("*** 
jiffies_per_second %u ***\n", HZ); +#endif + preempt_enable(); +} + +static void show_pgdatinfo(void) +{ + pg_data_t *pgdat; + + printk("pgdat:"); + for_each_online_pgdat(pgdat) { + printk(" %d: %lu,%lu,%lu", + pgdat->node_id, + pgdat->node_start_pfn, + pgdat->node_present_pages, + pgdat->node_spanned_pages); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + printk(",%p", pgdat->node_mem_map); +#endif + } + printk("\n"); +} + +static int show_partitions_io(struct gendisk *gp) +{ + struct disk_part_iter piter; + struct hd_struct *hd; + char buf[BDEVNAME_SIZE]; + int cpu; + + /* + if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) + seq_puts(seqf, "major minor name" + " rio rmerge rsect ruse wio wmerge " + "wsect wuse running use aveq" + "\n\n"); + */ + + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); + while ((hd = disk_part_iter_next(&piter))) { + cpu = part_stat_lock(); + part_round_stats(cpu, hd); + part_stat_unlock(); + printk("%4d %7d %s %lu %lu %llu " + "%u %lu %lu %llu %u %u %u %u\n", + MAJOR(part_devt(hd)), MINOR(part_devt(hd)), + disk_name(gp, hd->partno, buf), + part_stat_read(hd, ios[0]), + part_stat_read(hd, merges[0]), + (unsigned long long)part_stat_read(hd, sectors[0]), + jiffies_to_msecs(part_stat_read(hd, ticks[0])), + part_stat_read(hd, ios[1]), + part_stat_read(hd, merges[1]), + (unsigned long long)part_stat_read(hd, sectors[1]), + jiffies_to_msecs(part_stat_read(hd, ticks[1])), + part_in_flight(hd), + jiffies_to_msecs(part_stat_read(hd, io_ticks)), + jiffies_to_msecs(part_stat_read(hd, time_in_queue)) + ); + } + disk_part_iter_exit(&piter); + + return 0; +} + +static int show_one_disk_io(struct device *dev, void *x) +{ + char *name; + char buf[BDEVNAME_SIZE]; + struct gendisk *gd; + + gd = dev_to_disk(dev); + + name = disk_name(gd, 0, buf); + if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) && + isdigit(name[4])) + return 0; + + if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) && + isdigit(name[3])) + return 0; + + 
show_partitions_io(gd); + + return 0; +} + +static void show_diskio(void) +{ + printk("disk_io: "); + class_for_each_device(&block_class, NULL, NULL, show_one_disk_io); + printk("\n"); +} + +static void show_nrprocs(void) +{ + unsigned long _nr_running, _nr_sleeping, + _nr_unint, _nr_zombie, _nr_dead, _nr_stopped; + + _nr_running = nr_running(); + _nr_unint = nr_uninterruptible(); + _nr_sleeping = nr_sleeping(); + _nr_zombie = nr_zombie; + _nr_dead = atomic_read(&nr_dead); + _nr_stopped = nr_stopped(); + + printk("VEnum: %d, proc R %lu, S %lu, D %lu, " + "Z %lu, X %lu, T %lu (tot %d)\n", + nr_ve, _nr_running, _nr_sleeping, _nr_unint, + _nr_zombie, _nr_dead, _nr_stopped, nr_threads); +} + +static void wdog_print(void) +{ + show_header(); + show_irq_list(); + show_pgdatinfo(); + show_mem(); + show_diskio(); + show_schedule_latency(); + show_alloc_latency(); + show_nrprocs(); +} + +static int wdog_loop(void* data) +{ + while (1) { + wdog_print(); + try_to_freeze(); + + set_current_state(TASK_UNINTERRUPTIBLE); + if (kthread_should_stop()) + break; + schedule_timeout(sleep_timeout*HZ); + } + return 0; +} + +static int __init wdog_init(void) +{ + struct file *file; + + file = filp_open("/proc/interrupts", 0, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + intr_file = file; + + wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog"); + if (IS_ERR(wdog_thread_tsk)) { + filp_close(intr_file, NULL); + return -EBUSY; + } + return 0; +} + +static void __exit wdog_exit(void) +{ + kthread_stop(wdog_thread_tsk); + filp_close(intr_file, NULL); +} + +module_param(sleep_timeout, int, 0660); +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo WDOG"); +MODULE_LICENSE("GPL v2"); + +module_init(wdog_init) +module_exit(wdog_exit) diff -urNp linux-2.6.32.48/lib/is_single_threaded.c linux-2.6.32.48-openvz/lib/is_single_threaded.c --- linux-2.6.32.48/lib/is_single_threaded.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/lib/is_single_threaded.c 2011-11-21 
17:40:47.000000000 -0500 @@ -30,7 +30,7 @@ bool current_is_single_threaded(void) ret = false; rcu_read_lock(); - for_each_process(p) { + for_each_process_ve(p) { if (unlikely(p->flags & PF_KTHREAD)) continue; if (unlikely(p == task->group_leader)) @@ -48,7 +48,7 @@ bool current_is_single_threaded(void) * forked before exiting. */ smp_rmb(); - } while_each_thread(p, t); + } while_each_thread_ve(p, t); } ret = true; found: diff -urNp linux-2.6.32.48/lib/Kconfig.debug linux-2.6.32.48-openvz/lib/Kconfig.debug --- linux-2.6.32.48/lib/Kconfig.debug 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/lib/Kconfig.debug 2011-11-21 17:40:47.000000000 -0500 @@ -136,6 +136,15 @@ config DEBUG_SECTION_MISMATCH - Enable verbose reporting from modpost to help solving the section mismatches reported. +config SYSRQ_DEBUG + bool "Debugging via sysrq keys" + depends on MAGIC_SYSRQ + default y + help + Say Y if you want to extend functionality of magic key. It will + provide you with some debugging facilities such as dumping and + writing memory, resolving symbols and some other. 
+ config DEBUG_KERNEL bool "Kernel debugging" help diff -urNp linux-2.6.32.48/lib/kobject_uevent.c linux-2.6.32.48-openvz/lib/kobject_uevent.c --- linux-2.6.32.48/lib/kobject_uevent.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/lib/kobject_uevent.c 2011-11-21 17:40:47.000000000 -0500 @@ -38,6 +38,8 @@ static const char *kobject_actions[] = { [KOBJ_REMOVE] = "remove", [KOBJ_CHANGE] = "change", [KOBJ_MOVE] = "move", + [KOBJ_START] = "start", + [KOBJ_STOP] = "stop", [KOBJ_ONLINE] = "online", [KOBJ_OFFLINE] = "offline", }; diff -urNp linux-2.6.32.48/lib/nlattr.c linux-2.6.32.48-openvz/lib/nlattr.c --- linux-2.6.32.48/lib/nlattr.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/lib/nlattr.c 2011-11-21 17:40:47.000000000 -0500 @@ -196,7 +196,7 @@ int nla_parse(struct nlattr *tb[], int m } if (unlikely(rem > 0)) - printk(KERN_WARNING "netlink: %d bytes leftover after parsing " + ve_printk(VE_LOG, KERN_WARNING "netlink: %d bytes leftover after parsing " "attributes.\n", rem); err = 0; diff -urNp linux-2.6.32.48/lib/show_mem.c linux-2.6.32.48-openvz/lib/show_mem.c --- linux-2.6.32.48/lib/show_mem.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/lib/show_mem.c 2011-11-21 17:40:47.000000000 -0500 @@ -8,6 +8,7 @@ #include #include #include +#include void show_mem(void) { @@ -61,3 +62,4 @@ void show_mem(void) quicklist_total_size()); #endif } +EXPORT_SYMBOL_GPL(show_mem); diff -urNp linux-2.6.32.48/Makefile linux-2.6.32.48-openvz/Makefile --- linux-2.6.32.48/Makefile 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/Makefile 2011-11-21 17:40:47.000000000 -0500 @@ -352,7 +352,7 @@ KBUILD_AFLAGS := -D__ASSEMBLY__ KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null) KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) -export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION +export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION VZVERSION export ARCH SRCARCH CONFIG_SHELL 
HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC export CPP AR NM STRIP OBJCOPY OBJDUMP export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE @@ -1033,7 +1033,8 @@ define filechk_utsrelease.h echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \ exit 1; \ fi; \ - (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";) + (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; \ + echo \#define VZVERSION \"$(VZVERSION)\";) endef define filechk_version.h diff -urNp linux-2.6.32.48/mm/filemap.c linux-2.6.32.48-openvz/mm/filemap.c --- linux-2.6.32.48/mm/filemap.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/filemap.c 2011-11-21 17:40:47.000000000 -0500 @@ -42,6 +42,7 @@ #include /* for try_to_free_buffers */ #include +#include /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -121,6 +122,7 @@ void __remove_from_page_cache(struct pag radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; + ub_io_release_debug(page); mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) diff -urNp linux-2.6.32.48/mm/filemap_xip.c linux-2.6.32.48-openvz/mm/filemap_xip.c --- linux-2.6.32.48/mm/filemap_xip.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/filemap_xip.c 2011-11-21 17:40:47.000000000 -0500 @@ -19,6 +19,7 @@ #include #include #include +#include /* * We do use our own empty page to avoid interference with other users @@ -194,6 +195,8 @@ retry: flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page); + pb_remove_ref(page, mm); + ub_unused_privvm_inc(mm, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); diff -urNp linux-2.6.32.48/mm/fremap.c linux-2.6.32.48-openvz/mm/fremap.c --- linux-2.6.32.48/mm/fremap.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/fremap.c 2011-11-21 17:40:47.000000000 -0500 @@ -21,6 +21,8 @@ #include #include +#include + 
#include "internal.h" static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, @@ -38,6 +40,7 @@ static void zap_pte(struct mm_struct *mm if (pte_dirty(pte)) set_page_dirty(page); page_remove_rmap(page); + pb_remove_ref(page, mm); page_cache_release(page); update_hiwater_rss(mm); dec_mm_counter(mm, file_rss); @@ -64,8 +67,10 @@ static int install_file_pte(struct mm_st if (!pte) goto out; - if (!pte_none(*pte)) + if (!pte_none(*pte)) { zap_pte(mm, vma, addr, pte); + ub_unused_privvm_inc(mm, vma); + } set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); /* @@ -222,7 +227,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsign * drop PG_Mlocked flag for over-mapped range */ unsigned int saved_flags = vma->vm_flags; - munlock_vma_pages_range(vma, start, start + size); + __munlock_vma_pages_range(vma, start, start + size, 0); vma->vm_flags = saved_flags; } @@ -258,3 +263,4 @@ out: return err; } +EXPORT_SYMBOL_GPL(sys_remap_file_pages); diff -urNp linux-2.6.32.48/mm/internal.h linux-2.6.32.48-openvz/mm/internal.h --- linux-2.6.32.48/mm/internal.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/internal.h 2011-11-21 17:40:47.000000000 -0500 @@ -66,8 +66,14 @@ static inline unsigned long page_order(s #ifdef CONFIG_HAVE_MLOCK extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); +extern void __munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int acct); +static inline void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + __munlock_vma_pages_range(vma, start, end, 1); +} + static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); diff -urNp linux-2.6.32.48/mm/memory.c linux-2.6.32.48-openvz/mm/memory.c --- linux-2.6.32.48/mm/memory.c 
2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/memory.c 2011-11-21 17:40:47.000000000 -0500 @@ -42,6 +42,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -57,6 +60,11 @@ #include #include +#include +#include +#include +#include + #include #include #include @@ -94,7 +102,7 @@ EXPORT_SYMBOL(high_memory); * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, * as ancient (libc5 based) binaries can segfault. ) */ -int randomize_va_space __read_mostly = +int _randomize_va_space __read_mostly = #ifdef CONFIG_COMPAT_BRK 1; #else @@ -132,18 +140,21 @@ void pgd_clear_bad(pgd_t *pgd) pgd_ERROR(*pgd); pgd_clear(pgd); } +EXPORT_SYMBOL_GPL(pgd_clear_bad); void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } +EXPORT_SYMBOL_GPL(pud_clear_bad); void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); pmd_clear(pmd); } +EXPORT_SYMBOL_GPL(pmd_clear_bad); /* * Note: this doesn't free the actual pages themselves. That @@ -356,6 +367,7 @@ int __pte_alloc(struct mm_struct *mm, pm pte_free(mm, new); return 0; } +EXPORT_SYMBOL_GPL(__pte_alloc); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) { @@ -565,6 +577,7 @@ check_pfn: out: return pfn_to_page(pfn); } +EXPORT_SYMBOL_GPL(vm_normal_page); /* * copy one vm_area from one task to the other. 
Assumes the page tables @@ -575,7 +588,7 @@ out: static inline void copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + unsigned long addr, int *rss, struct page_beancounter **pbc) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; @@ -630,6 +643,7 @@ copy_one_pte(struct mm_struct *dst_mm, s if (page) { get_page(page); page_dup_rmap(page); + pb_dup_ref(page, dst_mm, pbc); rss[PageAnon(page)]++; } @@ -637,21 +651,36 @@ out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); } +#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1))) +#ifdef CONFIG_BEANCOUNTERS +#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub) +#else +#define same_ub(mm1, mm2) 1 +#endif + static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[2], rss_tot; + struct page_beancounter *pbc; + int err; + err = -ENOMEM; + pbc = same_ub(src_mm, dst_mm) ? 
PBC_COPY_SAME : NULL; again: + if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr))) + goto out; rss[1] = rss[0] = 0; dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) - return -ENOMEM; + goto out; src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -674,23 +703,32 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss, + &pbc); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(orig_src_pte); + rss_tot = rss[0] + rss[1]; + ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot); add_mm_rss(dst_mm, rss[0], rss[1]); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (addr != end) goto again; - return 0; + + err = 0; +out: + pb_free_list(&pbc); + return err; } static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + pud_t *dst_pud, pud_t *src_pud, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; @@ -705,14 +743,16 @@ static inline int copy_pmd_range(struct if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + pgd_t *dst_pgd, pgd_t *src_pgd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; @@ -727,19 +767,21 @@ static inline int copy_pud_range(struct if 
(pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } -int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) +int __copy_page_range(struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, + unsigned long addr, size_t size) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = vma->vm_mm; pgd_t *src_pgd, *dst_pgd; unsigned long next; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; + unsigned long end = addr + size; int ret; /* @@ -783,7 +825,7 @@ int copy_page_range(struct mm_struct *ds if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next))) { + dst_vma, vma, addr, next))) { ret = -ENOMEM; break; } @@ -794,6 +836,17 @@ int copy_page_range(struct mm_struct *ds vma->vm_start, end); return ret; } +EXPORT_SYMBOL_GPL(__copy_page_range); + +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *dst_vma, struct vm_area_struct *vma) +{ + if (dst_vma->vm_mm != dst) + BUG(); + if (vma->vm_mm != src) + BUG(); + return __copy_page_range(dst_vma, vma, vma->vm_start, vma->vm_end-vma->vm_start); +} static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, @@ -805,6 +858,7 @@ static unsigned long zap_pte_range(struc spinlock_t *ptl; int file_rss = 0; int anon_rss = 0; + int rss; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -860,6 +914,7 @@ static unsigned long zap_pte_range(struc file_rss--; } page_remove_rmap(page); + pb_remove_ref(page, mm); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); tlb_remove_page(tlb, page); @@ -880,6 +935,8 @@ static unsigned long zap_pte_range(struc 
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + rss = -(file_rss + anon_rss); + ub_unused_privvm_add(mm, vma, rss); add_mm_rss(mm, file_rss, anon_rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -2004,6 +2061,7 @@ static int do_wp_page(struct mm_struct * int reuse = 0, ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; + struct page_beancounter *pbc; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2110,6 +2168,8 @@ reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (old_page) + ClearPageCheckpointed(old_page); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, entry); ret |= VM_FAULT_WRITE; @@ -2123,6 +2183,9 @@ reuse: gotten: pte_unmap_unlock(page_table, ptl); + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2157,12 +2220,15 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { + pb_remove_ref(old_page, mm); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); } - } else + } else { + ub_unused_privvm_dec(mm, vma); inc_mm_counter(mm, anon_rss); + } flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2174,6 +2240,7 @@ gotten: */ ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); + pb_add_ref(new_page, mm, &pbc); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -2217,6 +2284,7 @@ gotten: page_cache_release(new_page); if (old_page) page_cache_release(old_page); + pb_free(&pbc); unlock: pte_unmap_unlock(page_table, ptl); if 
(dirty_page) { @@ -2256,6 +2324,8 @@ unlock: oom_free_new: page_cache_release(new_page); oom: + pb_free(&pbc); +oom_nopb: if (old_page) { if (page_mkwrite) { unlock_page(old_page); @@ -2514,10 +2584,16 @@ static int do_swap_page(struct mm_struct pte_t pte; struct mem_cgroup *ptr = NULL; int ret = 0; + struct page_beancounter *pbc; + cycles_t start; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - goto out; + goto out_nostat; + if (unlikely(pb_alloc(&pbc))) + return VM_FAULT_OOM; + + start = get_cycles(); entry = pte_to_swp_entry(orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { @@ -2592,6 +2668,7 @@ static int do_swap_page(struct mm_struct */ inc_mm_counter(mm, anon_rss); + ub_percpu_inc(mm->mm_ub, swapin); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2600,11 +2677,14 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page) + || swap_readonly(page)) try_to_free_swap(page); unlock_page(page); @@ -2620,6 +2700,11 @@ static int do_swap_page(struct mm_struct unlock: pte_unmap_unlock(page_table, ptl); out: + pb_free(&pbc); + spin_lock_irq(&kstat_glb_lock); + KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start); + spin_unlock_irq(&kstat_glb_lock); +out_nostat: return ret; out_nomap: mem_cgroup_cancel_charge_swapin(ptr); @@ -2627,6 +2712,7 @@ out_nomap: out_page: unlock_page(page); out_release: + pb_free(&pbc); page_cache_release(page); return ret; } @@ -2677,6 +2763,7 @@ static int 
do_anonymous_page(struct mm_s struct page *page; spinlock_t *ptl; pte_t entry; + struct page_beancounter *pbc = NULL; pte_unmap(page_table); @@ -2695,6 +2782,9 @@ static int do_anonymous_page(struct mm_s } /* Allocate our own private page. */ + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, address); @@ -2715,12 +2805,15 @@ static int do_anonymous_page(struct mm_s inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); setpte: set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, entry); unlock: + pb_free(&pbc); pte_unmap_unlock(page_table, ptl); return 0; release: @@ -2730,6 +2823,8 @@ release: oom_free_page: page_cache_release(page); oom: + pb_free(&pbc); +oom_nopb: return VM_FAULT_OOM; } @@ -2757,6 +2852,7 @@ static int __do_fault(struct mm_struct * int anon = 0; int charged = 0; struct page *dirty_page = NULL; + struct page_beancounter *pbc; struct vm_fault vmf; int ret; int page_mkwrite = 0; @@ -2766,9 +2862,13 @@ static int __do_fault(struct mm_struct * vmf.flags = flags; vmf.page = NULL; + ret = VM_FAULT_OOM; + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) - return ret; + goto out_fault; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2862,6 +2962,8 @@ static int __do_fault(struct mm_struct * */ /* Only go through if we didn't race with anybody else... 
*/ if (likely(pte_same(*page_table, orig_pte))) { + struct user_beancounter *ub; + flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) @@ -2878,6 +2980,25 @@ static int __do_fault(struct mm_struct * } } set_pte_at(mm, address, page_table, entry); + ub = page_ub(page); + if (ub != NULL && +#ifdef CONFIG_BC_IO_ACCOUNTING + !((unsigned long)ub & PAGE_IO_MARK) && +#endif + ub->ub_magic == UB_MAGIC) { + /* + * WOW: Page was already charged as page_ub. This may + * happen, for example, when some driver exports its low + * memory pages to user space. We can't account page as + * page_ub and page_bp at the same time. So uncharge + * page from UB counter. + */ + WARN_ON_ONCE(1); + ub_page_uncharge(page, 0); + } + + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, entry); @@ -2917,6 +3038,9 @@ out: page_cache_release(vmf.page); } +out_fault: + pb_free(&pbc); +oom_nopb: return ret; unwritable_page: @@ -3044,6 +3168,27 @@ int handle_mm_fault(struct mm_struct *mm pmd_t *pmd; pte_t *pte; +#ifdef CONFIG_VZ_GENCALLS + do { + int ret; +#ifdef CONFIG_BEANCOUNTERS + struct task_beancounter *tbc; + + tbc = &current->task_bc; + if (!test_bit(UB_AFLAG_NOTIF_PAGEIN, &mm->mm_ub->ub_aflags) && + tbc->pgfault_allot) { + tbc->pgfault_allot--; + break; /* skip notifier */ + } +#endif + ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_PAGEIN, + (void *)1); + if (ret & NOTIFY_FAIL) + return VM_FAULT_SIGBUS; + if (ret & NOTIFY_OK) + return VM_FAULT_MINOR; /* retry */ + } while (0); +#endif __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); @@ -3088,6 +3233,8 @@ int __pud_alloc(struct mm_struct *mm, pg } #endif /* __PAGETABLE_PUD_FOLDED */ +EXPORT_SYMBOL_GPL(__pud_alloc); + #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. 
@@ -3118,6 +3265,8 @@ int __pmd_alloc(struct mm_struct *mm, pu } #endif /* __PAGETABLE_PMD_FOLDED */ +EXPORT_SYMBOL_GPL(__pmd_alloc); + int make_pages_present(unsigned long addr, unsigned long end) { int ret, len, write; @@ -3137,6 +3286,8 @@ int make_pages_present(unsigned long add return ret == len ? 0 : -EFAULT; } +EXPORT_SYMBOL(make_pages_present); + #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff -urNp linux-2.6.32.48/mm/memory-failure.c linux-2.6.32.48-openvz/mm/memory-failure.c --- linux-2.6.32.48/mm/memory-failure.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/memory-failure.c 2011-11-21 17:40:47.000000000 -0500 @@ -226,7 +226,7 @@ static void collect_procs_anon(struct pa av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ goto out; - for_each_process (tsk) { + for_each_process_all (tsk) { if (!task_early_kill(tsk)) continue; list_for_each_entry (vma, &av->head, anon_vma_node) { @@ -263,7 +263,7 @@ static void collect_procs_file(struct pa read_lock(&tasklist_lock); spin_lock(&mapping->i_mmap_lock); - for_each_process(tsk) { + for_each_process_all(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); if (!task_early_kill(tsk)) diff -urNp linux-2.6.32.48/mm/mempool.c linux-2.6.32.48-openvz/mm/mempool.c --- linux-2.6.32.48/mm/mempool.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/mempool.c 2011-11-21 17:40:47.000000000 -0500 @@ -77,6 +77,8 @@ mempool_t *mempool_create_node(int min_n init_waitqueue_head(&pool->wait); pool->alloc = alloc_fn; pool->free = free_fn; + if (alloc_fn == mempool_alloc_slab) + kmem_mark_nocharge((struct kmem_cache *)pool_data); /* * First pre-allocate the guaranteed number of buffers. 
@@ -118,6 +120,7 @@ int mempool_resize(mempool_t *pool, int unsigned long flags; BUG_ON(new_min_nr <= 0); + gfp_mask &= ~__GFP_UBC; spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { @@ -211,6 +214,7 @@ void * mempool_alloc(mempool_t *pool, gf gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ + gfp_mask &= ~__GFP_UBC; gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); diff -urNp linux-2.6.32.48/mm/mlock.c linux-2.6.32.48-openvz/mm/mlock.c --- linux-2.6.32.48/mm/mlock.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/mlock.c 2011-11-21 17:40:47.000000000 -0500 @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" @@ -322,12 +323,14 @@ no_mlock: * and re-mlocked by try_to_{munlock|unmap} before we unmap and * free them. This will result in freeing mlocked pages. */ -void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +void __munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int acct) { unsigned long addr; lru_add_drain(); + if (acct) + ub_locked_uncharge(vma->vm_mm, end - start); vma->vm_flags &= ~VM_LOCKED; for (addr = start; addr < end; addr += PAGE_SIZE) { @@ -387,6 +390,12 @@ static int mlock_fixup(struct vm_area_st goto out; /* don't set VM_LOCKED, don't count */ } + if (newflags & VM_LOCKED) { + ret = ub_locked_charge(mm, end - start); + if (ret < 0) + goto out; + } + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -398,13 +407,13 @@ static int mlock_fixup(struct vm_area_st if (start != vma->vm_start) { ret = split_vma(mm, vma, start, 1); if (ret) - goto out; + goto out_uncharge; } if (end != vma->vm_end) { ret = split_vma(mm, vma, end, 0); if (ret) - goto out; + 
goto out_uncharge; } success: @@ -434,6 +443,11 @@ success: out: *prev = vma; return ret; + +out_uncharge: + if (newflags & VM_LOCKED) + ub_locked_uncharge(mm, end - start); + goto out; } static int do_mlock(unsigned long start, size_t len, int on) @@ -512,6 +526,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, st up_write(&current->mm->mmap_sem); return error; } +EXPORT_SYMBOL_GPL(sys_mlock); SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { @@ -524,6 +539,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, up_write(&current->mm->mmap_sem); return ret; } +EXPORT_SYMBOL_GPL(sys_munlock); static int do_mlockall(int flags) { diff -urNp linux-2.6.32.48/mm/mmap.c linux-2.6.32.48-openvz/mm/mmap.c --- linux-2.6.32.48/mm/mmap.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/mmap.c 2011-11-21 17:40:47.000000000 -0500 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -41,10 +42,13 @@ #define arch_mmap_check(addr, len, flags) (0) #endif +#include + #ifndef arch_rebalance_pgtables #define arch_rebalance_pgtables(addr, len) (addr) #endif +static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft); static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -110,6 +114,18 @@ int __vm_enough_memory(struct mm_struct vm_acct_memory(pages); +#ifdef CONFIG_BEANCOUNTERS + switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM, + (void *)pages) + & (NOTIFY_OK | NOTIFY_FAIL)) { + case NOTIFY_OK: + return 0; + case NOTIFY_FAIL: + vm_unacct_memory(pages); + return -ENOMEM; + } +#endif + /* * Sometimes we want to use more memory than we have */ @@ -231,6 +247,9 @@ static struct vm_area_struct *remove_vma struct vm_area_struct *next = vma->vm_next; might_sleep(); + + ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start, + vma->vm_flags, vma->vm_file); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) { 
@@ -288,7 +307,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk) goto out; set_brk: mm->brk = brk; @@ -1116,6 +1135,7 @@ unsigned long mmap_region(struct file *f struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; + unsigned long ub_charged = 0; /* Clear old maps */ error = -ENOMEM; @@ -1155,6 +1175,11 @@ munmap_back: vm_flags |= VM_ACCOUNT; } + if (ub_memory_charge(mm, len, vm_flags, file, + (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) + goto charge_error; + ub_charged = 1; + /* * Can we just expand an old mapping? */ @@ -1167,7 +1192,8 @@ munmap_back: * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL | + (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0)); if (!vma) { error = -ENOMEM; goto unacct_error; @@ -1197,6 +1223,19 @@ munmap_back: goto unmap_and_free_vma; if (vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); + if (vm_flags != vma->vm_flags) { + /* + * ->vm_flags has been changed in f_op->mmap method. + * We have to recharge ub memory. + */ + ub_memory_uncharge(mm, len, vm_flags, file); + if (ub_memory_charge(mm, len, vma->vm_flags, file, + (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) { + ub_charged = 0; + error = -ENOMEM; + goto unmap_and_free_vma; + } + } /* Can addr have changed?? 
* @@ -1250,6 +1289,9 @@ unmap_and_free_vma: free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: + if (ub_charged) + ub_memory_uncharge(mm, len, vm_flags, file); +charge_error: if (charged) vm_unacct_memory(charged); return error; @@ -1580,12 +1622,16 @@ static int acct_stack_growth(struct vm_a if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; + if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags, + vma->vm_file, UB_SOFT)) + goto fail_charge; + /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory_mm(mm, grow)) - return -ENOMEM; + goto fail_sec; /* Ok, everything looks good - let it rip */ mm->total_vm += grow; @@ -1593,6 +1639,11 @@ static int acct_stack_growth(struct vm_a mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; + +fail_sec: + ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file); +fail_charge: + return -ENOMEM; } #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) @@ -1885,6 +1936,7 @@ int split_vma(struct mm_struct * mm, str return 0; } +EXPORT_SYMBOL_GPL(split_vma); /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the @@ -1992,7 +2044,7 @@ static inline void verify_mm_writelocked * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -unsigned long do_brk(unsigned long addr, unsigned long len) +static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft) { struct mm_struct * mm = current->mm; struct vm_area_struct * vma, * prev; @@ -2052,8 +2104,11 @@ unsigned long do_brk(unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; + if (ub_memory_charge(mm, len, flags, NULL, soft)) + goto fail_charge; + if (security_vm_enough_memory(len >> PAGE_SHIFT)) - return -ENOMEM; + goto fail_sec; /* Can we just expand an old private anonymous mapping? 
*/ vma = vma_merge(mm, prev, addr, addr + len, flags, @@ -2064,11 +2119,10 @@ unsigned long do_brk(unsigned long addr, /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; - } + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL | + (soft == UB_SOFT ? __GFP_SOFT_UBC : 0)); + if (!vma) + goto fail_alloc; vma->vm_mm = mm; vma->vm_start = addr; @@ -2084,8 +2138,19 @@ out: mm->locked_vm += (len >> PAGE_SHIFT); } return addr; + +fail_alloc: + vm_unacct_memory(len >> PAGE_SHIFT); +fail_sec: + ub_memory_uncharge(mm, len, flags, NULL); +fail_charge: + return -ENOMEM; } +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + return __do_brk(addr, len, UB_SOFT); +} EXPORT_SYMBOL(do_brk); /* Release all mmaps. */ @@ -2278,10 +2343,11 @@ static void special_mapping_close(struct { } -static const struct vm_operations_struct special_mapping_vmops = { +const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, }; +EXPORT_SYMBOL_GPL(special_mapping_vmops); /* * Called with mm->mmap_sem held for writing. 
diff -urNp linux-2.6.32.48/mm/mmzone.c linux-2.6.32.48-openvz/mm/mmzone.c --- linux-2.6.32.48/mm/mmzone.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/mmzone.c 2011-11-21 17:40:47.000000000 -0500 @@ -14,6 +14,7 @@ struct pglist_data *first_online_pgdat(v { return NODE_DATA(first_online_node); } +EXPORT_SYMBOL_GPL(first_online_pgdat); struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) { @@ -23,6 +24,7 @@ struct pglist_data *next_online_pgdat(st return NULL; return NODE_DATA(nid); } +EXPORT_SYMBOL_GPL(next_online_pgdat); /* * next_zone - helper magic for for_each_zone() diff -urNp linux-2.6.32.48/mm/mprotect.c linux-2.6.32.48-openvz/mm/mprotect.c --- linux-2.6.32.48/mm/mprotect.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/mprotect.c 2011-11-21 17:40:47.000000000 -0500 @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -29,6 +30,8 @@ #include #include +#include + #ifndef pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { @@ -142,6 +145,8 @@ mprotect_fixup(struct vm_area_struct *vm unsigned long charged = 0; pgoff_t pgoff; int error; + unsigned long ch_size; + int ch_dir; int dirty_accountable = 0; if (newflags == oldflags) { @@ -149,6 +154,12 @@ mprotect_fixup(struct vm_area_struct *vm return 0; } + error = -ENOMEM; + ch_size = nrpages - pages_in_vma_range(vma, start, end); + ch_dir = ub_protected_charge(mm, ch_size, newflags, vma); + if (ch_dir == PRIVVM_ERROR) + goto fail_ch; + /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we @@ -160,7 +171,7 @@ mprotect_fixup(struct vm_area_struct *vm VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) - return -ENOMEM; + goto fail_sec; newflags |= VM_ACCOUNT; } } @@ -212,11 +223,17 @@ success: mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 
vm_stat_account(mm, newflags, vma->vm_file, nrpages); + if (ch_dir == PRIVVM_TO_SHARED) + __ub_unused_privvm_dec(mm, ch_size); perf_event_mmap(vma); return 0; fail: vm_unacct_memory(charged); +fail_sec: + if (ch_dir == PRIVVM_TO_PRIVATE) + __ub_unused_privvm_dec(mm, ch_size); +fail_ch: return error; } @@ -318,3 +335,4 @@ out: up_write(&current->mm->mmap_sem); return error; } +EXPORT_SYMBOL_GPL(sys_mprotect); diff -urNp linux-2.6.32.48/mm/mremap.c linux-2.6.32.48-openvz/mm/mremap.c --- linux-2.6.32.48/mm/mremap.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/mremap.c 2011-11-21 17:40:47.000000000 -0500 @@ -27,6 +27,8 @@ #include "internal.h" +#include + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -175,12 +177,16 @@ static unsigned long move_vma(struct vm_ int split = 0; int err; + if (ub_memory_charge(mm, new_len, vm_flags, + vma->vm_file, UB_HARD)) + goto err; + /* * We'd prefer to avoid failure later on in do_munmap: * which may split one vma into three before unmapping. 
*/ if (mm->map_count >= sysctl_max_map_count - 3) - return -ENOMEM; + goto err_nomem; /* * Advise KSM to break any KSM pages in the area to be moved: @@ -192,12 +198,12 @@ static unsigned long move_vma(struct vm_ err = ksm_madvise(vma, old_addr, old_addr + old_len, MADV_UNMERGEABLE, &vm_flags); if (err) - return err; + goto err_nomem; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); if (!new_vma) - return -ENOMEM; + goto err_nomem; moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { @@ -256,7 +262,13 @@ static unsigned long move_vma(struct vm_ new_addr + new_len); } - return new_addr; + if (new_addr != -ENOMEM) + return new_addr; + +err_nomem: + ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file); +err: + return -ENOMEM; } static struct vm_area_struct *vma_to_resize(unsigned long addr, @@ -463,7 +475,13 @@ unsigned long do_mremap(unsigned long ad if (old_len == vma->vm_end - addr) { /* can we just expand the current mapping? 
*/ if (vma_expandable(vma, new_len - old_len)) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long len = (new_len - old_len); + int pages = len >> PAGE_SHIFT; + + ret = -ENOMEM; + if (ub_memory_charge(mm, len, vma->vm_flags, + vma->vm_file, UB_HARD)) + goto out; vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL); diff -urNp linux-2.6.32.48/mm/oom_kill.c linux-2.6.32.48-openvz/mm/oom_kill.c --- linux-2.6.32.48/mm/oom_kill.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/oom_kill.c 2011-11-21 17:40:47.000000000 -0500 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -28,6 +30,9 @@ #include #include +#include +#include + int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks; @@ -224,16 +229,16 @@ static inline enum oom_constraint constr * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints, +struct task_struct *select_bad_process(struct user_beancounter *ub, struct mem_cgroup *mem) { struct task_struct *p; struct task_struct *chosen = NULL; struct timespec uptime; - *ppoints = 0; + unsigned long chosen_points = 0; do_posix_clock_monotonic_gettime(&uptime); - for_each_process(p) { + for_each_process_all(p) { unsigned long points; /* @@ -247,6 +252,8 @@ static struct task_struct *select_bad_pr continue; if (mem && !task_in_mem_cgroup(p, mem)) continue; + if (ub_oom_task_skip(ub, p)) + continue; /* * This task already has access to memory reserves and is @@ -275,16 +282,16 @@ static struct task_struct *select_bad_pr return ERR_PTR(-1UL); chosen = p; - *ppoints = ULONG_MAX; + chosen_points = ULONG_MAX; } if (p->signal->oom_adj == OOM_DISABLE) continue; points = badness(p, uptime.tv_sec); - if (points > *ppoints || !chosen) { + if (points > chosen_points || !chosen) { chosen = p; - *ppoints = points; + chosen_points = points; } } @@ -310,7 +317,7 @@ static 
void dump_tasks(const struct mem_ printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " "name\n"); - do_each_thread(g, p) { + do_each_thread_all(g, p) { struct mm_struct *mm; if (mem && !task_in_mem_cgroup(p, mem)) @@ -334,7 +341,7 @@ static void dump_tasks(const struct mem_ get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, p->comm); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); } /* @@ -369,10 +376,22 @@ static void __oom_kill_task(struct task_ set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); + ub_oom_task_killed(p); } static int oom_kill_task(struct task_struct *p) { + struct user_beancounter *ub; + + task_lock(p); + if (p->mm == NULL) { + task_unlock(p); + return 1; + } + + ub = get_beancounter(mm_ub(p->mm)); + task_unlock(p); + /* WARNING: mm may not be dereferenced since we did not obtain its * value from get_task_mm(p). This is OK since all we need to do is * compare mm to q->mm below. @@ -381,17 +400,18 @@ static int oom_kill_task(struct task_str * change to NULL at any time since we do not hold task_lock(p). * However, this is of no concern to us. 
*/ - if (!p->mm || p->signal->oom_adj == OOM_DISABLE) + if (p->signal->oom_adj == OOM_DISABLE) return 1; __oom_kill_task(p, 1); + ub_oom_mm_killed(ub); + put_beancounter(ub); return 0; } -static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned long points, struct mem_cgroup *mem, - const char *message) +int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + struct mem_cgroup *mem, const char *message) { struct task_struct *c; @@ -419,8 +439,8 @@ static int oom_kill_process(struct task_ return 0; } - printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", - message, task_pid_nr(p), p->comm, points); + printk(KERN_ERR "%s: kill process %d (%s) or a child\n", + message, task_pid_nr(p), p->comm); /* Try to kill a child first */ list_for_each_entry(c, &p->children, sibling) { @@ -449,7 +469,7 @@ retry: if (!p) p = current; - if (oom_kill_process(p, gfp_mask, 0, points, mem, + if (oom_kill_process(p, gfp_mask, 0, mem, "Memory cgroup out of memory")) goto retry; out: @@ -527,31 +547,39 @@ void clear_zonelist_oom(struct zonelist static void __out_of_memory(gfp_t gfp_mask, int order) { struct task_struct *p; - unsigned long points; + struct user_beancounter *ub = NULL; if (sysctl_oom_kill_allocating_task) - if (!oom_kill_process(current, gfp_mask, order, 0, NULL, + if (!oom_kill_process(current, gfp_mask, order, NULL, "Out of memory (oom_kill_allocating_task)")) return; retry: + put_beancounter(ub); + /* * Rambo mode: Shoot down a process and hope it solves whatever * issues we may have. */ - p = select_bad_process(&points, NULL); + ub = ub_oom_select_worst(); + p = select_bad_process(ub, NULL); if (PTR_ERR(p) == -1UL) return; /* Found nothing?!?! Either we hang forever, or we panic. 
*/ if (!p) { + if (ub != NULL) + goto retry; + read_unlock(&tasklist_lock); + ub_oom_unlock(); panic("Out of memory and no killable processes...\n"); } - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) + if (oom_kill_process(p, gfp_mask, order, NULL, "Out of memory")) goto retry; + + put_beancounter(ub); } /* @@ -577,10 +605,27 @@ void pagefault_out_of_memory(void) if (sysctl_panic_on_oom) panic("out of memory from page fault. panic_on_oom is selected.\n"); + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OUTOFMEM, NULL) + & (NOTIFY_OK | NOTIFY_FAIL)) + return; + + if (ub_oom_lock()) + goto rest_and_return; + + if (printk_ratelimit()) { + printk(KERN_WARNING "%s invoked PF oom-killer: oomkilladj=%d\n", + current->comm, current->signal->oom_adj); + dump_stack(); + show_mem(); + show_slab_info(); + } + read_lock(&tasklist_lock); __out_of_memory(0, 0); /* unknown gfp_mask and order */ read_unlock(&tasklist_lock); + ub_oom_unlock(); + /* * Give "p" a good chance of killing itself before we * retry to allocate memory. @@ -614,6 +659,23 @@ void out_of_memory(struct zonelist *zone if (sysctl_panic_on_oom == 2) panic("out of memory. Compulsory panic_on_oom is selected.\n"); + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OUTOFMEM, NULL) + & (NOTIFY_OK | NOTIFY_FAIL)) + return; + + if (ub_oom_lock()) + goto out_oom_lock; + + if (printk_ratelimit()) { + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, + current->signal->oom_adj); + dump_stack(); + show_mem(); + show_slab_info(); + } + /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. 
@@ -623,7 +685,7 @@ void out_of_memory(struct zonelist *zone switch (constraint) { case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, 0, NULL, + oom_kill_process(current, gfp_mask, order, NULL, "No available memory (MPOL_BIND)"); break; @@ -637,7 +699,9 @@ void out_of_memory(struct zonelist *zone } read_unlock(&tasklist_lock); + ub_oom_unlock(); +out_oom_lock: /* * Give "p" a good chance of killing itself before we * retry to allocate memory unless "p" is current diff -urNp linux-2.6.32.48/mm/page_alloc.c linux-2.6.32.48-openvz/mm/page_alloc.c --- linux-2.6.32.48/mm/page_alloc.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/page_alloc.c 2011-11-21 17:40:47.000000000 -0500 @@ -54,6 +54,9 @@ #include #include "internal.h" +#include +#include + /* * Array of node states. */ @@ -105,6 +108,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z 32, }; +EXPORT_SYMBOL(nr_swap_pages); EXPORT_SYMBOL(totalram_pages); static char * const zone_names[MAX_NR_ZONES] = { @@ -510,6 +514,7 @@ static inline int free_pages_check(struc bad_page(page); return 1; } + ub_io_release_debug(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -602,6 +607,7 @@ static void __free_pages_ok(struct page arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); + ub_page_uncharge(page, order); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); @@ -1103,6 +1109,7 @@ static void free_hot_cold_page(struct pa pcp = &zone_pcp(zone, get_cpu())->pcp; migratetype = get_pageblock_migratetype(page); set_page_private(page, migratetype); + ub_page_uncharge(page, 0); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); @@ -1796,6 +1803,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask) return alloc_flags; } +int alloc_fail_warn; + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -1917,7 
+1926,7 @@ rebalance: } nopage: - if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { printk(KERN_WARNING "%s: page allocation failure." " order:%d, mode:0x%x\n", p->comm, order, gfp_mask); @@ -1932,6 +1941,29 @@ got_pg: } +extern unsigned long cycles_per_jiffy; +static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order, + struct page *page, cycles_t time) +{ +#ifdef CONFIG_VE + int ind; + unsigned long flags; + + time = (jiffies - time) * cycles_per_jiffy; + if (!(gfp_mask & __GFP_WAIT)) + ind = 0; + else if (!(gfp_mask & __GFP_HIGHMEM)) + ind = (order > 0 ? 2 : 1); + else + ind = (order > 0 ? 4 : 3); + spin_lock_irqsave(&kstat_glb_lock, flags); + KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time); + if (!page) + kstat_glob.alloc_fails[ind]++; + spin_unlock_irqrestore(&kstat_glb_lock, flags); +#endif +} + /* * This is the 'heart' of the zoned buddy allocator. */ @@ -1943,6 +1975,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u struct zone *preferred_zone; struct page *page; int migratetype = allocflags_to_migratetype(gfp_mask); + cycles_t start; gfp_mask &= gfp_allowed_mask; @@ -1966,6 +1999,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u if (!preferred_zone) return NULL; + start = jiffies; /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, @@ -1975,6 +2009,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + __alloc_collect_stats(gfp_mask, order, page, start); + if (page && ub_page_charge(page, order, gfp_mask)) { + __free_pages(page, order); + page = NULL; + } + trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; } diff -urNp linux-2.6.32.48/mm/page-writeback.c linux-2.6.32.48-openvz/mm/page-writeback.c --- linux-2.6.32.48/mm/page-writeback.c 2011-11-08 19:02:43.000000000 -0500 +++ 
linux-2.6.32.48-openvz/mm/page-writeback.c 2011-11-21 17:40:47.000000000 -0500 @@ -35,6 +35,8 @@ #include #include +#include + /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. @@ -1069,6 +1071,7 @@ int write_one_page(struct page *page, in } else { unlock_page(page); } + return ret; } EXPORT_SYMBOL(write_one_page); @@ -1087,14 +1090,15 @@ int __set_page_dirty_no_writeback(struct * Helper function for set_page_dirty family. * NOTE: This relies on being atomic wrt interrupts. */ -void account_page_dirtied(struct page *page, struct address_space *mapping) +int account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); + return 1; } + return 0; } /* @@ -1114,6 +1118,9 @@ void account_page_dirtied(struct page *p */ int __set_page_dirty_nobuffers(struct page *page) { + int acct; + + acct = 0; if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; @@ -1121,16 +1128,19 @@ int __set_page_dirty_nobuffers(struct pa if (!mapping) return 1; + acct = 0; spin_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? 
*/ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); + acct = account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&mapping->tree_lock); + if (acct) + task_io_account_write(page, PAGE_CACHE_SIZE, 0); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1268,6 +1278,7 @@ int clear_page_dirty_for_io(struct page dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + ub_io_release_context(page, PAGE_CACHE_SIZE); return 1; } return 0; diff -urNp linux-2.6.32.48/mm/rmap.c linux-2.6.32.48-openvz/mm/rmap.c --- linux-2.6.32.48/mm/rmap.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/rmap.c 2011-11-21 17:40:47.000000000 -0500 @@ -56,6 +56,9 @@ #include #include +#include +#include + #include #include "internal.h" @@ -133,6 +136,7 @@ int anon_vma_prepare(struct vm_area_stru } return 0; } +EXPORT_SYMBOL_GPL(anon_vma_prepare); void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { @@ -158,6 +162,7 @@ void anon_vma_link(struct vm_area_struct spin_unlock(&anon_vma->lock); } } +EXPORT_SYMBOL_GPL(anon_vma_link); void anon_vma_unlink(struct vm_area_struct *vma) { @@ -189,7 +194,7 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, anon_vma_ctor); } /* @@ -215,12 +220,14 @@ out: rcu_read_unlock(); return NULL; } +EXPORT_SYMBOL_GPL(page_lock_anon_vma); void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(page_unlock_anon_vma); /* * At what user virtual address is page expected in @vma? 
@@ -738,6 +745,12 @@ void page_remove_rmap(struct page *page) page_clear_dirty(page); set_page_dirty(page); } + /* + * Well, when a page is unmapped, we cannot keep PG_checkpointed + * flag, it is not accessible via process VM and we have no way + * to reset its state + */ + ClearPageCheckpointed(page); if (PageAnon(page)) { mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, NR_ANON_PAGES); @@ -851,6 +864,9 @@ static int try_to_unmap_one(struct page page_remove_rmap(page); + ub_unused_privvm_inc(mm, vma); + ub_percpu_inc(mm->mm_ub, unmap); + pb_remove_ref(page, mm); page_cache_release(page); out_unmap: @@ -966,6 +982,9 @@ static int try_to_unmap_cluster(unsigned set_page_dirty(page); page_remove_rmap(page); + ub_percpu_inc(mm->mm_ub, unmap); + pb_remove_ref(page, mm); + ub_unused_privvm_inc(mm, vma); page_cache_release(page); dec_mm_counter(mm, file_rss); (*mapcount)--; diff -urNp linux-2.6.32.48/mm/shmem.c linux-2.6.32.48-openvz/mm/shmem.c --- linux-2.6.32.48/mm/shmem.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/shmem.c 2011-11-21 17:40:47.000000000 -0500 @@ -31,7 +31,11 @@ #include #include +#ifdef CONFIG_VE +#define shm_mnt (get_exec_env()->shmem_mnt) +#else static struct vfsmount *shm_mnt; +#endif #ifdef CONFIG_SHMEM /* @@ -60,6 +64,8 @@ static struct vfsmount *shm_mnt; #include #include +#include + #include #include #include @@ -107,14 +113,31 @@ enum sgp_type { }; #ifdef CONFIG_TMPFS + +#include + +static unsigned long tmpfs_ram_pages(void) +{ + struct meminfo mi; + + if (ve_is_super(get_exec_env())) + return totalram_pages; + + memset(&mi, 0, sizeof(mi)); + si_meminfo(&mi.si); + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) & NOTIFY_FAIL) + return 0; + return mi.si.totalram; +} + static unsigned long shmem_default_max_blocks(void) { - return totalram_pages / 2; + return tmpfs_ram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { - return min(totalram_pages - totalhigh_pages, 
totalram_pages / 2); + return min(totalram_pages - totalhigh_pages, tmpfs_ram_pages() / 2); } #endif @@ -214,7 +237,7 @@ static inline void shmem_unacct_blocks(u static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; -static const struct file_operations shmem_file_operations; +const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; @@ -277,7 +300,7 @@ static void shmem_free_inode(struct supe * * It has to be called with the spinlock held. */ -static void shmem_recalc_inode(struct inode *inode) +static void shmem_recalc_inode(struct inode *inode, long swp_freed) { struct shmem_inode_info *info = SHMEM_I(inode); long freed; @@ -287,6 +310,8 @@ static void shmem_recalc_inode(struct in info->alloced -= freed; shmem_unacct_blocks(info->flags, freed); shmem_free_blocks(inode, freed); + if (freed > swp_freed) + ub_tmpfs_respages_sub(info, freed - swp_freed); } } @@ -391,6 +416,11 @@ static void shmem_swp_set(struct shmem_i struct page *page = kmap_atomic_to_page(entry); set_page_private(page, page_private(page) + incdec); } + + if (incdec == 1) + ub_tmpfs_respages_dec(info); + else + ub_tmpfs_respages_inc(info); } /** @@ -407,14 +437,24 @@ static swp_entry_t *shmem_swp_alloc(stru struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct page *page = NULL; swp_entry_t *entry; + unsigned long ub_val; if (sgp != SGP_WRITE && ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return ERR_PTR(-EINVAL); + ub_val = 0; + if (info->next_index <= index) { + ub_val = index + 1 - info->next_index; + if (ub_shmpages_charge(info, ub_val)) + return ERR_PTR(-ENOSPC); + } + while (!(entry = shmem_swp_entry(info, index, &page))) { - if (sgp == SGP_READ) - return shmem_swp_map(ZERO_PAGE(0)); + if (sgp == SGP_READ) { + entry = 
shmem_swp_map(ZERO_PAGE(0)); + goto out; + } /* * Test free_blocks against 1 not 0, since we have 1 data * page (and perhaps indirect index pages) yet to allocate: @@ -424,7 +464,8 @@ static swp_entry_t *shmem_swp_alloc(stru spin_lock(&sbinfo->stat_lock); if (sbinfo->free_blocks <= 1) { spin_unlock(&sbinfo->stat_lock); - return ERR_PTR(-ENOSPC); + entry = ERR_PTR(-ENOSPC); + goto out; } sbinfo->free_blocks--; inode->i_blocks += BLOCKS_PER_PAGE; @@ -432,31 +473,43 @@ static swp_entry_t *shmem_swp_alloc(stru } spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); + page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | + __GFP_UBC); if (page) set_page_private(page, 0); spin_lock(&info->lock); if (!page) { - shmem_free_blocks(inode, 1); - return ERR_PTR(-ENOMEM); + entry = ERR_PTR(-ENOMEM); + goto out_block; } if (sgp != SGP_WRITE && ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { entry = ERR_PTR(-EINVAL); - break; + goto out_dir; } - if (info->next_index <= index) + if (info->next_index <= index) { + ub_val = 0; info->next_index = index + 1; + } } if (page) { /* another task gave its page, or truncated the file */ shmem_free_blocks(inode, 1); shmem_dir_free(page); } - if (info->next_index <= index && !IS_ERR(entry)) + if (info->next_index <= index) info->next_index = index + 1; return entry; + +out_dir: + shmem_dir_free(page); +out_block: + shmem_free_blocks(inode, 1); +out: + if (ub_val) + ub_shmpages_uncharge(info, ub_val); + return entry; } /** @@ -564,6 +617,7 @@ static void shmem_truncate_range(struct return; spin_lock(&info->lock); + ub_shmpages_uncharge(info, info->next_index - idx); info->flags |= SHMEM_TRUNCATE; if (likely(end == (loff_t) -1)) { limit = info->next_index; @@ -750,7 +804,7 @@ done2: info->swapped -= nr_swaps_freed; if (nr_pages_to_free) shmem_free_blocks(inode, nr_pages_to_free); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, nr_swaps_freed); spin_unlock(&info->lock); /* @@ -833,6 
+887,7 @@ static void shmem_delete_inode(struct in } } BUG_ON(inode->i_blocks); + shmi_ub_put(info); shmem_free_inode(inode->i_sb); clear_inode(inode); } @@ -1020,6 +1075,12 @@ int shmem_unuse(swp_entry_t entry, struc out: return found; /* 0 or 1 or -ENOMEM */ } +#ifdef CONFIG_BEANCOUNTERS +#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub)) +#else +#define shm_get_swap_page(info) (get_swap_page(NULL)) +#endif + /* * Move the page from the page cache to the swap cache. */ @@ -1051,7 +1112,7 @@ static int shmem_writepage(struct page * * discarded. */ if (wbc->for_reclaim) - swap = get_swap_page(); + swap = shm_get_swap_page(info); else swap.val = 0; @@ -1069,7 +1130,7 @@ static int shmem_writepage(struct page * free_swap_and_cache(*entry); shmem_swp_set(info, entry, 0); } - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, 0); if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { remove_from_page_cache(page); @@ -1252,7 +1313,7 @@ repeat: } spin_lock(&info->lock); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, 0); entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) { spin_unlock(&info->lock); @@ -1455,6 +1516,7 @@ repeat: clear_highpage(filepage); flush_dcache_page(filepage); SetPageUptodate(filepage); + ub_tmpfs_respages_inc(info); if (sgp == SGP_DIRTY) set_page_dirty(filepage); } @@ -1512,20 +1574,27 @@ int shmem_lock(struct file *file, int lo spin_lock(&info->lock); if (lock && !(info->flags & VM_LOCKED)) { + if (ub_lockedshm_charge(info, inode->i_size) < 0) + goto out_ch; + if (!user_shm_lock(inode->i_size, user)) goto out_nomem; info->flags |= VM_LOCKED; mapping_set_unevictable(file->f_mapping); } if (!lock && (info->flags & VM_LOCKED) && user) { + ub_lockedshm_uncharge(info, inode->i_size); user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); scan_mapping_unevictable_pages(file->f_mapping); } - retval = 0; + spin_unlock(&info->lock); + return 0; out_nomem: + 
ub_lockedshm_uncharge(info, inode->i_size); +out_ch: spin_unlock(&info->lock); return retval; } @@ -1559,6 +1628,7 @@ static struct inode *shmem_get_inode(str inode->i_generation = get_seconds(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); + shmi_ub_set(info, get_exec_ub()); spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); @@ -2182,7 +2252,7 @@ static int shmem_parse_options(char *opt size = memparse(value,&rest); if (*rest == '%') { size <<= PAGE_SHIFT; - size *= totalram_pages; + size *= tmpfs_ram_pages(); do_div(size, 100); rest++; } @@ -2424,7 +2494,7 @@ static const struct address_space_operat .error_remove_page = generic_error_remove_page, }; -static const struct file_operations shmem_file_operations = { +const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, @@ -2437,6 +2507,7 @@ static const struct file_operations shme .splice_write = generic_file_splice_write, #endif }; +EXPORT_SYMBOL_GPL(shmem_file_operations); static const struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, @@ -2506,6 +2577,10 @@ static const struct vm_operations_struct #endif }; +int is_shmem_mapping(struct address_space *map) +{ + return (map != NULL && map->a_ops == &shmem_aops); +} static int shmem_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) @@ -2513,12 +2588,13 @@ static int shmem_get_sb(struct file_syst return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); } -static struct file_system_type tmpfs_fs_type = { +struct file_system_type tmpfs_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .get_sb = shmem_get_sb, .kill_sb = kill_litter_super, }; +EXPORT_SYMBOL(tmpfs_fs_type); int __init init_tmpfs(void) { @@ -2608,6 +2684,36 @@ int shmem_lock(struct file *file, int lo /* common code */ +static inline int shm_charge_ahead(struct inode 
*inode) +{ +#ifdef CONFIG_BEANCOUNTERS + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long idx; + swp_entry_t *entry; + + if (!inode->i_size) + return 0; + idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + /* + * Just touch info to allocate space for entry and + * make all UBC checks + */ + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, SGP_CACHE); + if (IS_ERR(entry)) + goto err; + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + return 0; + +err: + spin_unlock(&info->lock); + return PTR_ERR(entry); +#else + return 0; +#endif +} + /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc//maps @@ -2653,6 +2759,9 @@ struct file *shmem_file_setup(const char d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ + error = shm_charge_ahead(inode); + if (error) + goto close_file; init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, &shmem_file_operations); @@ -2689,6 +2798,8 @@ int shmem_zero_setup(struct vm_area_stru if (vma->vm_file) fput(vma->vm_file); + else if (vma->vm_flags & VM_WRITE) + __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; diff -urNp linux-2.6.32.48/mm/slab.c linux-2.6.32.48-openvz/mm/slab.c --- linux-2.6.32.48/mm/slab.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/slab.c 2011-11-21 17:40:47.000000000 -0500 @@ -115,30 +115,14 @@ #include #include #include +#include +#include #include #include #include -/* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. - * 0 for faster, smaller code (especially in the critical paths). - * - * STATS - 1 to collect stats for /proc/slabinfo. - * 0 for faster, smaller code (especially in the critical paths). 
- * - * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) - */ - -#ifdef CONFIG_DEBUG_SLAB -#define DEBUG 1 -#define STATS 1 -#define FORCED_DEBUG 1 -#else -#define DEBUG 0 -#define STATS 0 -#define FORCED_DEBUG 0 -#endif +#include /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) @@ -173,19 +157,21 @@ #endif /* Legal flag mask for kmem_cache_create(). */ -#if DEBUG +#if SLAB_DEBUG # define CREATE_MASK (SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_UBC | SLAB_NO_CHARGE | \ SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_UBC | SLAB_NO_CHARGE | \ SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) #endif @@ -389,12 +375,14 @@ static void kmem_list3_init(struct kmem_ #define REAPTIMEOUT_CPUC (2*HZ) #define REAPTIMEOUT_LIST3 (4*HZ) -#if STATS +#if SLAB_STATS +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) +#define STATS_INC_SHRUNK(x) ((x)->shrunk++) + #define STATS_INC_ACTIVE(x) ((x)->num_active++) #define STATS_DEC_ACTIVE(x) ((x)->num_active--) #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) -#define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) #define STATS_SET_HIGH(x) \ do { \ if ((x)->num_active > (x)->high_mark) \ @@ -414,11 +402,12 @@ static void kmem_list3_init(struct kmem_ #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) #else +#define STATS_INC_GROWN(x) do { } while (0) +#define STATS_ADD_REAPED(x,y) do { } while (0) +#define STATS_INC_SHRUNK(x) do { } while (0) #define STATS_INC_ACTIVE(x) do { } while (0) #define STATS_DEC_ACTIVE(x) do { } while (0) 
#define STATS_INC_ALLOCED(x) do { } while (0) -#define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -431,7 +420,7 @@ static void kmem_list3_init(struct kmem_ #define STATS_INC_FREEMISS(x) do { } while (0) #endif -#if DEBUG +#if SLAB_DEBUG /* * memory layout of objects: @@ -571,6 +560,8 @@ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, #include CACHE(ULONG_MAX) +#include + CACHE(ULONG_MAX) #undef CACHE }; EXPORT_SYMBOL(malloc_sizes); @@ -584,10 +575,17 @@ struct cache_names { static struct cache_names __initdata cache_names[] = { #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, #include + {NULL,}, +#undef CACHE +#define CACHE(x) { .name = "size-" #x "(UBC)", .name_dma = "size-" #x "(DMA,UBC)" }, +#include {NULL,} #undef CACHE }; +int malloc_cache_num; +EXPORT_SYMBOL(malloc_cache_num); + static struct arraycache_init initarray_cache __initdata = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = @@ -663,7 +661,8 @@ static inline void init_lock_keys(void) * Guard access to the cache-chain. */ static DEFINE_MUTEX(cache_chain_mutex); -static struct list_head cache_chain; +static LIST_HEAD(cache_chain); +static DEFINE_SPINLOCK(cache_chain_lock); /* * chicken and egg problem: delay the per-cpu array allocation @@ -697,7 +696,9 @@ static inline struct kmem_cache *__find_ { struct cache_sizes *csizep = malloc_sizes; -#if DEBUG + if (gfpflags & __GFP_UBC) + csizep += malloc_cache_num; +#if SLAB_DEBUG /* This happens if someone tries to call * kmem_cache_create(), or __kmalloc(), before * the generic caches are initialized. 
@@ -727,9 +728,102 @@ static struct kmem_cache *kmem_find_gene return __find_general_cachep(size, gfpflags); } -static size_t slab_mgmt_size(size_t nr_objs, size_t align) +static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +{ + return (kmem_bufctl_t *) (slabp + 1); +} + +#ifdef CONFIG_BEANCOUNTERS +#define init_slab_ubps(cachep, slabp) do { \ + if (!((cachep)->flags & SLAB_UBC)) \ + break; \ + memset(slab_ubcs(cachep, slabp), 0, \ + (cachep)->num * sizeof(void *)); \ + } while (0) + +#define UB_ALIGN(flags) (flags & SLAB_UBC ? sizeof(void *) : 1) +#define UB_EXTRA(flags) (flags & SLAB_UBC ? sizeof(void *) : 0) +#define set_cache_objuse(cachep) do { \ + (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \ + (cachep)->num - 1) / (cachep)->num; \ + if (!OFF_SLAB(cachep)) \ + break; \ + (cachep)->objuse += ((cachep)->slabp_cache->objuse + \ + (cachep)->num - 1) / (cachep)->num; \ + } while (0) + +void kmem_mark_nocharge(struct kmem_cache *cachep) +{ + cachep->flags |= SLAB_NO_CHARGE; +} + +int kmem_cache_objuse(struct kmem_cache *cachep) +{ + return cachep->objuse; +} + +EXPORT_SYMBOL(kmem_cache_objuse); + +int kmem_obj_objuse(void *obj) +{ + return virt_to_cache(obj)->objuse; +} + +int kmem_dname_objuse(void *obj) +{ + return virt_to_cache(obj)->objuse; +} + +unsigned long ub_cache_growth(struct kmem_cache *cachep) +{ +#if SLAB_STATS + return (cachep->grown - cachep->reaped - cachep->shrunk) + << cachep->gfporder; +#else + return 0; +#endif +} + +#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\ + (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\ + sizeof(void *)))) + +struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj) +{ + struct slab *slabp; + int objnr; + + BUG_ON(!(cachep->flags & SLAB_UBC)); + slabp = virt_to_slab(obj); + objnr = (obj - slabp->s_mem) / cachep->buffer_size; + return slab_ubcs(cachep, slabp) + objnr; +} + +struct user_beancounter *slab_ub(void *obj) +{ + return 
*ub_slab_ptr(virt_to_cache(obj), obj); +} + +EXPORT_SYMBOL(slab_ub); + +#else +#define UB_ALIGN(flags) 1 +#define UB_EXTRA(flags) 0 +#define set_cache_objuse(c) do { } while (0) +#define init_slab_ubps(c, s) do { } while (0) +#endif + +static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags) +{ + size_t size_noub; + + size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t); + return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags); +} + +static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags) { - return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); + return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align); } /* @@ -774,20 +868,23 @@ static void cache_estimate(unsigned long * into account. */ nr_objs = (slab_size - sizeof(struct slab)) / - (buffer_size + sizeof(kmem_bufctl_t)); + (buffer_size + sizeof(kmem_bufctl_t) + + UB_EXTRA(flags)); /* * This calculated number will be either the right * amount, or one greater than what we want. 
*/ - if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size - > slab_size) + if (slab_mgmt_size(nr_objs, align, flags) + + nr_objs * buffer_size > slab_size) nr_objs--; + BUG_ON(slab_mgmt_size(nr_objs, align, flags) + + nr_objs * buffer_size > slab_size); if (nr_objs > SLAB_LIMIT) nr_objs = SLAB_LIMIT; - mgmt_size = slab_mgmt_size(nr_objs, align); + mgmt_size = slab_mgmt_size(nr_objs, align, flags); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -1338,6 +1435,7 @@ static void init_list(struct kmem_cache MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->nodelists[nodeid] = ptr; } +static int offslab_limit; /* * For setting up all the kmem_list3s for cache whose buffer_size is same as @@ -1408,7 +1506,6 @@ void __init kmem_cache_init(void) node = numa_node_id(); /* 1) create the cache_cache */ - INIT_LIST_HEAD(&cache_chain); list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; @@ -1420,7 +1517,7 @@ void __init kmem_cache_init(void) */ cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + nr_node_ids * sizeof(struct kmem_list3 *); -#if DEBUG +#if SLAB_DEBUG cache_cache.obj_size = cache_cache.buffer_size; #endif cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, @@ -1467,6 +1564,7 @@ void __init kmem_cache_init(void) slab_early_init = 0; + for (i = 0; i < 2; i++) { while (sizes->cs_size != ULONG_MAX) { /* * For performance, all the general caches are L1 aligned. @@ -1479,21 +1577,30 @@ void __init kmem_cache_init(void) sizes->cs_cachep = kmem_cache_create(names->name, sizes->cs_size, ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, + ARCH_KMALLOC_FLAGS|SLAB_PANIC| + (i ? 
SLAB_UBC : 0)|SLAB_NO_CHARGE, NULL); } + if (!(OFF_SLAB(sizes->cs_cachep))) + offslab_limit = sizes->cs_size; #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = kmem_cache_create( - names->name_dma, + sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| + (i ? SLAB_UBC : 0) | SLAB_NO_CHARGE| SLAB_PANIC, NULL); #endif sizes++; names++; } + + sizes++; + names++; + if (!i) + malloc_cache_num = sizes - malloc_sizes; + } /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; @@ -1674,7 +1781,7 @@ static void kmem_rcu_free(struct rcu_hea kmem_cache_free(cachep->slabp_cache, slab_rcu); } -#if DEBUG +#if SLAB_DEBUG #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1751,7 +1858,7 @@ static void dump_line(char *data, int of } #endif -#if DEBUG +#if SLAB_DEBUG static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) { @@ -1844,7 +1951,7 @@ static void check_poison_obj(struct kmem } #endif -#if DEBUG +#if SLAB_DEBUG static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) { int i; @@ -1944,7 +2051,6 @@ static void __kmem_cache_destroy(struct static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1957,15 +2063,10 @@ static size_t calculate_slab_order(struc continue; if (flags & CFLGS_OFF_SLAB) { - /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. Needed to avoid a possible - * looping condition in cache_grow(). 
- */ - offslab_limit = size - sizeof(struct slab); - offslab_limit /= sizeof(kmem_bufctl_t); + int slab_size; - if (num > offslab_limit) + slab_size = slab_mgmt_size_noalign(num, flags); + if (slab_size > offslab_limit) break; } @@ -2133,9 +2234,9 @@ kmem_cache_create (const char *name, siz } } -#if DEBUG +#if SLAB_DEBUG WARN_ON(strchr(name, ' ')); /* It confuses parsers */ -#if FORCED_DEBUG +#if SLAB_FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with * large objects, if the increased size would increase the object size @@ -2225,7 +2326,7 @@ kmem_cache_create (const char *name, siz if (!cachep) goto oops; -#if DEBUG +#if SLAB_DEBUG cachep->obj_size = size; /* @@ -2247,7 +2348,7 @@ kmem_cache_create (const char *name, siz else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) +#if SLAB_FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); @@ -2279,8 +2380,7 @@ kmem_cache_create (const char *name, siz cachep = NULL; goto oops; } - slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + slab_size = slab_mgmt_size(cachep->num, align, flags); /* * If the slab has been placed off-slab, and we have enough space then @@ -2293,8 +2393,7 @@ kmem_cache_create (const char *name, siz if (flags & CFLGS_OFF_SLAB) { /* really off slab. 
No need for manual alignment */ - slab_size = - cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + slab_size = slab_mgmt_size_noalign(cachep->num, flags); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2340,7 +2439,10 @@ kmem_cache_create (const char *name, siz } /* cache setup completed, link it into the list */ + spin_lock(&cache_chain_lock); list_add(&cachep->next, &cache_chain); + spin_unlock(&cache_chain_lock); + set_cache_objuse(cachep); oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", @@ -2353,7 +2455,7 @@ oops: } EXPORT_SYMBOL(kmem_cache_create); -#if DEBUG +#if SLAB_DEBUG static void check_irq_off(void) { BUG_ON(!irqs_disabled()); @@ -2449,10 +2551,11 @@ static int drain_freelist(struct kmem_ca } slabp = list_entry(p, struct slab, list); -#if DEBUG +#if SLAB_DEBUG BUG_ON(slabp->inuse); #endif list_del(&slabp->list); + STATS_INC_SHRUNK(cache); /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2535,10 +2638,14 @@ void kmem_cache_destroy(struct kmem_cach /* * the chain is never empty, cache_cache is never destroyed */ + spin_lock(&cache_chain_lock); list_del(&cachep->next); + spin_unlock(&cache_chain_lock); if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); + spin_lock(&cache_chain_lock); list_add(&cachep->next, &cache_chain); + spin_unlock(&cache_chain_lock); mutex_unlock(&cache_chain_mutex); put_online_cpus(); return; @@ -2547,6 +2654,8 @@ void kmem_cache_destroy(struct kmem_cach if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) rcu_barrier(); + + ub_kmemcache_free(cachep); __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); put_online_cpus(); @@ -2573,7 +2682,7 @@ static struct slab *alloc_slabmgmt(struc if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. 
*/ slabp = kmem_cache_alloc_node(cachep->slabp_cache, - local_flags, nodeid); + (local_flags & ~__GFP_UBC), nodeid); /* * If the first object in the slab is leaked (it's allocated * but no one has a reference to it), we want to make sure @@ -2593,14 +2702,10 @@ static struct slab *alloc_slabmgmt(struc slabp->s_mem = objp + colour_off; slabp->nodeid = nodeid; slabp->free = 0; + init_slab_ubps(cachep, slabp); return slabp; } -static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) -{ - return (kmem_bufctl_t *) (slabp + 1); -} - static void cache_init_objs(struct kmem_cache *cachep, struct slab *slabp) { @@ -2608,7 +2713,7 @@ static void cache_init_objs(struct kmem_ for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, slabp, i); -#if DEBUG +#if SLAB_DEBUG /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) poison_obj(cachep, objp, POISON_FREE); @@ -2666,7 +2771,7 @@ static void *slab_get_obj(struct kmem_ca slabp->inuse++; next = slab_bufctl(slabp)[slabp->free]; -#if DEBUG +#if SLAB_DEBUG slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; WARN_ON(slabp->nodeid != nodeid); #endif @@ -2680,7 +2785,7 @@ static void slab_put_obj(struct kmem_cac { unsigned int objnr = obj_to_index(cachep, slabp, objp); -#if DEBUG +#if SLAB_DEBUG /* Verify that the slab belongs to the intended node */ WARN_ON(slabp->nodeid != nodeid); @@ -2768,7 +2873,7 @@ static int cache_grow(struct kmem_cache * 'nodeid'. 
*/ if (!objp) - objp = kmem_getpages(cachep, local_flags, nodeid); + objp = kmem_getpages(cachep, local_flags & ~__GFP_UBC, nodeid); if (!objp) goto failed; @@ -2801,7 +2906,7 @@ failed: return 0; } -#if DEBUG +#if SLAB_DEBUG /* * Perform extra freeing checks: @@ -3014,12 +3119,12 @@ static inline void cache_alloc_debugchec gfp_t flags) { might_sleep_if(flags & __GFP_WAIT); -#if DEBUG +#if SLAB_DEBUG kmem_flagcheck(cachep, flags); #endif } -#if DEBUG +#if SLAB_DEBUG static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, void *caller) { @@ -3389,11 +3494,16 @@ __cache_alloc(struct kmem_cache *cachep, cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, flags); prefetchw(objp); + if (objp && should_charge(cachep->flags, flags) && + ub_slab_charge(cachep, objp, flags)) { + kmem_cache_free(cachep, objp); + objp = NULL; + } + local_irq_restore(save_flags); if (likely(objp)) kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); @@ -3430,6 +3540,7 @@ static void free_block(struct kmem_cache /* fixup slab chains */ if (slabp->inuse == 0) { if (l3->free_objects > l3->free_limit) { + STATS_INC_SHRUNK(cachep); l3->free_objects -= cachep->num; /* No need to drop any previously held * lock here, even if we have a off-slab slab @@ -3458,7 +3569,7 @@ static void cache_flusharray(struct kmem int node = numa_node_id(); batchcount = ac->batchcount; -#if DEBUG +#if SLAB_DEBUG BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); @@ -3479,7 +3590,7 @@ static void cache_flusharray(struct kmem free_block(cachep, ac->entry, batchcount, node); free_done: -#if STATS +#if SLAB_STATS { int i = 0; struct list_head *p; @@ -3516,6 +3627,9 @@ static inline void __cache_free(struct k kmemcheck_slab_free(cachep, 
objp, obj_size(cachep)); + if (should_uncharge(cachep->flags)) + ub_slab_uncharge(cachep, objp); + /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which @@ -3970,7 +4084,7 @@ static int enable_cpucache(struct kmem_c if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; -#if DEBUG +#if SLAB_DEBUG /* * With debugging enabled, large batchcount lead to excessively long * periods with disabled local interrupts. Limit the batchcount @@ -4037,6 +4151,7 @@ static void cache_reap(struct work_struc /* Give up. Setup the next iteration. */ goto out; + {KSTAT_PERF_ENTER(cache_reap) list_for_each_entry(searchp, &cache_chain, next) { check_irq_on(); @@ -4077,6 +4192,7 @@ next: check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); + KSTAT_PERF_LEAVE(cache_reap)} out: /* Set up the next iteration */ schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); @@ -4090,7 +4206,7 @@ static void print_slabinfo_header(struct * Output format version, so at least we can change it * without _too_ many complaints. 
*/ -#if STATS +#if SLAB_STATS seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); #else seq_puts(m, "slabinfo - version: 2.1\n"); @@ -4099,14 +4215,82 @@ static void print_slabinfo_header(struct " "); seq_puts(m, " : tunables "); seq_puts(m, " : slabdata "); -#if STATS +#if SLAB_STATS seq_puts(m, " : globalstat " - " "); + " "); seq_puts(m, " : cpustat "); #endif seq_putc(m, '\n'); } +#define SHOW_TOP_SLABS 10 + +static unsigned long get_cache_size(struct kmem_cache *cachep) +{ + unsigned long flags; + unsigned long slabs; + struct kmem_list3 *l3; + struct list_head *lh; + int node; + + slabs = 0; + + for_each_online_node (node) { + l3 = cachep->nodelists[node]; + if (l3 == NULL) + continue; + + spin_lock_irqsave(&l3->list_lock, flags); + list_for_each (lh, &l3->slabs_full) + slabs++; + list_for_each (lh, &l3->slabs_partial) + slabs++; + list_for_each (lh, &l3->slabs_free) + slabs++; + spin_unlock_irqrestore(&l3->list_lock, flags); + } + + return slabs * (PAGE_SIZE << cachep->gfporder) + + (OFF_SLAB(cachep) ? 
+ cachep->slabp_cache->buffer_size * slabs : 0); +} + +void show_slab_info(void) +{ + int i, j; + unsigned long size; + struct kmem_cache *ptr; + unsigned long sizes[SHOW_TOP_SLABS]; + struct kmem_cache *top[SHOW_TOP_SLABS]; + + memset(top, 0, sizeof(top)); + memset(sizes, 0, sizeof(sizes)); + + printk("Top %d caches:\n", SHOW_TOP_SLABS); + + spin_lock(&cache_chain_lock); + list_for_each_entry (ptr, &cache_chain, next) { + size = get_cache_size(ptr); + + j = 0; + for (i = 1; i < SHOW_TOP_SLABS; i++) + if (sizes[i] < sizes[j]) + j = i; + + if (size > sizes[j]) { + sizes[j] = size; + top[j] = ptr; + } + } + + for (i = 0; i < SHOW_TOP_SLABS; i++) + if (top[i]) + printk("%-21s: size %10lu objsize %10u\n", + top[i]->name, sizes[i], + top[i]->buffer_size); + spin_unlock(&cache_chain_lock); +} + static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; @@ -4185,19 +4369,20 @@ static int s_show(struct seq_file *m, vo if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + seq_printf(m, "%-21s %6lu %6lu %6u %4u %4d", name, active_objs, num_objs, cachep->buffer_size, cachep->num, (1 << cachep->gfporder)); seq_printf(m, " : tunables %4u %4u %4u", cachep->limit, cachep->batchcount, cachep->shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", active_slabs, num_slabs, shared_avail); -#if STATS +#if SLAB_STATS { /* list3 stats */ unsigned long high = cachep->high_mark; unsigned long allocs = cachep->num_allocations; unsigned long grown = cachep->grown; unsigned long reaped = cachep->reaped; + unsigned long shrunk = cachep->shrunk; unsigned long errors = cachep->errors; unsigned long max_freeable = cachep->max_freeable; unsigned long node_allocs = cachep->node_allocs; @@ -4205,9 +4390,10 @@ static int s_show(struct seq_file *m, vo unsigned long overflows = cachep->node_overflow; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, + %4lu %4lu %4lu %4lu 
%4lu %4lu", + allocs, high, grown, reaped, errors, max_freeable, node_allocs, - node_frees, overflows); + node_frees, overflows, shrunk); } /* cpu stats */ { diff -urNp linux-2.6.32.48/mm/slub.c linux-2.6.32.48-openvz/mm/slub.c --- linux-2.6.32.48/mm/slub.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/slub.c 2011-11-21 17:40:47.000000000 -0500 @@ -29,6 +29,8 @@ #include #include +#include + /* * Lock order: * 1. slab_lock(page) @@ -149,9 +151,11 @@ /* * Set of flags that will prevent slab merging + * + * FIXME - think over how to allow merging accountable slubs */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | SLAB_UBC) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ SLAB_CACHE_DMA | SLAB_NOTRACK) @@ -201,6 +205,8 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; +static DEFINE_SPINLOCK(cache_chain_lock); + #ifdef CONFIG_SLUB_DEBUG static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); @@ -321,6 +327,90 @@ static inline int oo_objects(struct kmem return x.x & OO_MASK; } +#ifdef CONFIG_BEANCOUNTERS +static inline void inc_cache_grown(struct kmem_cache *s) +{ + atomic_inc(&s->grown); +} + +static inline void dec_cache_grown(struct kmem_cache *s) +{ + atomic_dec(&s->grown); +} + +unsigned long ub_cache_growth(struct kmem_cache *cachep) +{ + return atomic_read(&cachep->grown) << oo_order(cachep->oo); /* live slabs scaled by slab order, roughly pages; oo.x is the packed order/objects word and must not be used as a shift count
*/ +} + +static void __flush_cpu_slab(struct kmem_cache *s, int cpu); + +int kmem_cache_objuse(struct kmem_cache *cachep) +{ + return cachep->objuse; +} + +EXPORT_SYMBOL(kmem_cache_objuse); + +int kmem_obj_objuse(void *obj) +{ + return kmem_cache_objuse(virt_to_head_page(obj)->slab); +} + +EXPORT_SYMBOL(kmem_obj_objuse); + +int kmem_dname_objuse(void *obj) +{ + struct kmem_cache *s; + + /* + * Allocations larger than PAGE_SIZE/2 go directly through + * __get_free_pages() and aren't associated with any cache. + */ + s = virt_to_head_page(obj)->slab; + if (!s) + return PAGE_SIZE; + return kmem_cache_objuse(s); +} + +#define page_ubs(pg) (pg->bc.slub_ubs) + +struct user_beancounter **ub_slab_ptr(struct kmem_cache *s, void *obj) +{ + struct page *pg; + + BUG_ON(!(s->flags & SLAB_UBC)); + pg = virt_to_head_page(obj); + return page_ubs(pg) + slab_index(obj, s, page_address(pg)); +} + +EXPORT_SYMBOL(ub_slab_ptr); + +struct user_beancounter *slab_ub(void *obj) +{ + struct page *pg; + + pg = virt_to_head_page(obj); + BUG_ON(!(pg->slab->flags & SLAB_UBC)); + return page_ubs(pg)[slab_index(obj, pg->slab, page_address(pg))]; +} + +EXPORT_SYMBOL(slab_ub); + +void kmem_mark_nocharge(struct kmem_cache *cachep) +{ + cachep->flags |= SLAB_NO_CHARGE; +} +#else +static inline void inc_cache_grown(struct kmem_cache *s) +{ +} + +static inline void dec_cache_grown(struct kmem_cache *s) +{ +} +#endif + #ifdef CONFIG_SLUB_DEBUG /* * Debug settings: @@ -1105,6 +1195,7 @@ static struct page *allocate_slab(struct struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + flags &= ~__GFP_UBC; flags |= s->allocflags; /* @@ -1149,9 +1240,12 @@ static struct page *allocate_slab(struct NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); + inc_cache_grown(s); return page; } +static void __free_slab(struct kmem_cache *s, struct page *page); + static void setup_object(struct kmem_cache *s, struct page *page, void *object) { @@ -1174,6 +1268,18 @@ static struct page *new_slab(struct 
kmem if (!page) goto out; +#ifdef CONFIG_BEANCOUNTERS + if (s->flags & SLAB_UBC) { + BUG_ON(page_ubs(page) != NULL); + page_ubs(page) = kzalloc(page->objects * sizeof(void *), + flags & ~__GFP_UBC); + if (page_ubs(page) == NULL) { + __free_slab(s, page); + page = NULL; + goto out; + } + } +#endif inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; @@ -1225,6 +1331,13 @@ static void __free_slab(struct kmem_cach __ClearPageSlab(page); reset_page_mapcount(page); +#ifdef CONFIG_BEANCOUNTERS + if (page_ubs(page) != NULL) { + BUG_ON(!(s->flags & SLAB_UBC)); + kfree(page_ubs(page)); + page_ubs(page) = NULL; + } +#endif if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; __free_pages(page, order); @@ -1249,6 +1362,8 @@ static void free_slab(struct kmem_cache call_rcu(head, rcu_free_slab); } else __free_slab(s, page); + + dec_cache_grown(s); } static void discard_slab(struct kmem_cache *s, struct page *page) @@ -1733,6 +1848,13 @@ static __always_inline void *slab_alloc( c->freelist = object[c->offset]; stat(c, ALLOC_FASTPATH); } + + if (object && should_charge(s->flags, gfpflags) && + ub_slab_charge(s, object, gfpflags)) { + kmem_cache_free(s, object); + object = NULL; + } + local_irq_restore(flags); if (unlikely((gfpflags & __GFP_ZERO) && object)) @@ -1875,6 +1997,9 @@ static __always_inline void slab_free(st c = get_cpu_slab(s, smp_processor_id()); kmemcheck_slab_free(s, object, c->objsize); debug_check_no_locks_freed(object, c->objsize); + + if (should_uncharge(s->flags)) + ub_slab_uncharge(s, x); if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(object, c->objsize); if (likely(page == c->page && c->node >= 0)) { @@ -2497,6 +2622,9 @@ static int kmem_cache_open(struct kmem_c #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif +#ifdef CONFIG_BEANCOUNTERS + s->objuse = s->size + (sizeof(struct page) / oo_objects(s->oo)); +#endif if (!init_kmem_cache_nodes(s, gfpflags & 
~SLUB_DMA)) goto error; @@ -2630,9 +2758,11 @@ static inline int kmem_cache_close(struc void kmem_cache_destroy(struct kmem_cache *s) { down_write(&slub_lock); + spin_lock(&cache_chain_lock); s->refcount--; if (!s->refcount) { list_del(&s->list); + spin_unlock(&cache_chain_lock); up_write(&slub_lock); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " @@ -2642,8 +2772,10 @@ void kmem_cache_destroy(struct kmem_cach if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); sysfs_slab_remove(s); - } else + } else { + spin_unlock(&cache_chain_lock); up_write(&slub_lock); + } } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2653,6 +2785,10 @@ EXPORT_SYMBOL(kmem_cache_destroy); struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); +#ifdef CONFIG_BEANCOUNTERS +struct kmem_cache ub_kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; +EXPORT_SYMBOL(ub_kmalloc_caches); +#endif static int __init setup_slub_min_order(char *str) { @@ -2695,6 +2831,11 @@ static struct kmem_cache *create_kmalloc { unsigned int flags = 0; + if (gfp_flags & __GFP_UBC) { + flags = SLAB_UBC | SLAB_NO_CHARGE; + gfp_flags &= ~__GFP_UBC; + } + if (gfp_flags & SLUB_DMA) flags = SLAB_CACHE_DMA; @@ -2706,7 +2847,9 @@ static struct kmem_cache *create_kmalloc flags, NULL)) goto panic; + spin_lock(&cache_chain_lock); list_add(&s->list, &slab_caches); + spin_unlock(&cache_chain_lock); if (sysfs_slab_add(s)) goto panic; @@ -2779,7 +2922,9 @@ static noinline struct kmem_cache *dma_k goto unlock_out; } + spin_lock(&cache_chain_lock); list_add(&s->list, &slab_caches); + spin_unlock(&cache_chain_lock); kmalloc_caches_dma[index] = s; if (slab_state >= SYSFS) @@ -2843,11 +2988,14 @@ static struct kmem_cache *get_slab(size_ index = fls(size - 1); #ifdef CONFIG_ZONE_DMA - if (unlikely((flags & SLUB_DMA))) + if (unlikely((flags & SLUB_DMA))) { + BUG_ON(flags & __GFP_UBC); return dma_kmalloc_cache(index, flags); + } #endif - return &kmalloc_caches[index]; + + 
return __kmalloc_cache(flags, index); } void *__kmalloc(size_t size, gfp_t flags) @@ -3187,6 +3335,11 @@ void __init kmem_cache_init(void) create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", sizeof(struct kmem_cache_node), GFP_NOWAIT); kmalloc_caches[0].refcount = -1; +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[0], "kmem_cache_node_ubc", + sizeof(struct kmem_cache_node), GFP_NOWAIT | __GFP_UBC); + ub_kmalloc_caches[0].refcount = -1; +#endif caches++; hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); @@ -3199,17 +3352,29 @@ void __init kmem_cache_init(void) if (KMALLOC_MIN_SIZE <= 32) { create_kmalloc_cache(&kmalloc_caches[1], "kmalloc-96", 96, GFP_NOWAIT); +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[1], + "kmalloc-96-ubc", 96, GFP_NOWAIT | __GFP_UBC); +#endif caches++; } if (KMALLOC_MIN_SIZE <= 64) { create_kmalloc_cache(&kmalloc_caches[2], "kmalloc-192", 192, GFP_NOWAIT); +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[2], + "kmalloc-192-ubc", 192, GFP_NOWAIT | __GFP_UBC); +#endif caches++; } for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_NOWAIT); +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[i], + "kmalloc-ubc", 1 << i, GFP_NOWAIT | __GFP_UBC); +#endif caches++; } @@ -3255,9 +3420,14 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { kmalloc_caches[i]. 
name = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); +#ifdef CONFIG_BEANCOUNTERS + ub_kmalloc_caches[i].name = + kasprintf(GFP_NOWAIT | __GFP_UBC, "kmalloc-%d-ubc", 1 << i); +#endif + } #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); @@ -3383,11 +3553,15 @@ struct kmem_cache *kmem_cache_create(con if (s) { if (kmem_cache_open(s, GFP_KERNEL, name, size, align, flags, ctor)) { + spin_lock(&cache_chain_lock); list_add(&s->list, &slab_caches); + spin_unlock(&cache_chain_lock); up_write(&slub_lock); if (sysfs_slab_add(s)) { down_write(&slub_lock); + spin_lock(&cache_chain_lock); list_del(&s->list); + spin_unlock(&cache_chain_lock); up_write(&slub_lock); kfree(s); goto err; @@ -4555,6 +4729,8 @@ static char *create_unique_id(struct kme *p++ = 'a'; if (s->flags & SLAB_DEBUG_FREE) *p++ = 'F'; + if (s->flags & SLAB_UBC) + *p++ = 'b'; if (!(s->flags & SLAB_NOTRACK)) *p++ = 't'; if (p != name + 1) @@ -4707,6 +4883,76 @@ static void print_slabinfo_header(struct seq_putc(m, '\n'); } +#define SHOW_TOP_SLABS 10 + +static unsigned long get_cache_size(struct kmem_cache *cache) +{ + unsigned long flags; + unsigned long slabs; + struct kmem_cache_node *n; + struct list_head *lh; + int cpu, node; + + slabs = 0; + + for_each_online_cpu(cpu) + slabs++; + + for_each_online_node(node) { + n = get_node(cache, node); + if (!n) + continue; + spin_lock_irqsave(&n->list_lock, flags); +#ifdef CONFIG_SLUB_DEBUG + list_for_each(lh, &n->full) + slabs++; +#endif + list_for_each(lh, &n->partial) + slabs++; + spin_unlock_irqrestore(&n->list_lock, flags); + } + + return slabs * (PAGE_SIZE << oo_order(cache->oo)); +} + +void show_slab_info(void) +{ + int i, j; + unsigned long size; + struct kmem_cache *ptr; + unsigned long sizes[SHOW_TOP_SLABS]; + struct kmem_cache *top[SHOW_TOP_SLABS]; + + memset(top, 0, sizeof(top)); + memset(sizes, 0, sizeof(sizes)); + + printk("Top %d caches:\n", SHOW_TOP_SLABS); + + spin_lock(&cache_chain_lock); + list_for_each_entry(ptr, &slab_caches, list) { + size = 
get_cache_size(ptr); + + j = 0; + for (i = 1; i < SHOW_TOP_SLABS; i++) { + if (sizes[i] < sizes[j]) + j = i; + } + if (size > sizes[j]) { + sizes[j] = size; + top[j] = ptr; + } + } + + for (i = 0; i < SHOW_TOP_SLABS; i++) { + if (top[i]) + printk("%-21s: size %10lu objsize %10u\n", + top[i]->name, sizes[i], + top[i]->size); + } + + spin_unlock(&cache_chain_lock); +} + static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; diff -urNp linux-2.6.32.48/mm/swapfile.c linux-2.6.32.48-openvz/mm/swapfile.c --- linux-2.6.32.48/mm/swapfile.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/swapfile.c 2011-11-21 17:40:47.000000000 -0500 @@ -35,6 +35,8 @@ #include #include +#include + static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; @@ -47,9 +49,13 @@ static const char Unused_file[] = "Unuse static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct swap_info[MAX_SWAPFILES]; +struct swap_info_struct swap_info[MAX_SWAPFILES]; +EXPORT_SYMBOL(total_swap_pages); +EXPORT_SYMBOL(swap_lock); +EXPORT_SYMBOL(swap_list); +EXPORT_SYMBOL(swap_info); static DEFINE_MUTEX(swapon_mutex); @@ -456,7 +462,7 @@ no_page: return 0; } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct user_beancounter *ub) { struct swap_info_struct *si; pgoff_t offset; @@ -477,6 +483,8 @@ swp_entry_t get_swap_page(void) wrapped++; } + if (si->flags & SWP_READONLY) + continue; if (!si->highest_bit) continue; if (!(si->flags & SWP_WRITEOK)) @@ -487,6 +495,7 @@ swp_entry_t get_swap_page(void) offset = scan_swap_map(si, SWAP_CACHE); if (offset) { spin_unlock(&swap_lock); + ub_swapentry_inc(si, offset, ub); return swp_entry(type, offset); } next = swap_list.next; @@ -498,6 +507,8 @@ noswap: return (swp_entry_t) {0}; } 
+EXPORT_SYMBOL(get_swap_page); + /* The only caller of this function is now susupend routine */ swp_entry_t get_swap_page_of_type(int type) { @@ -506,7 +517,7 @@ swp_entry_t get_swap_page_of_type(int ty spin_lock(&swap_lock); si = swap_info + type; - if (si->flags & SWP_WRITEOK) { + if (si->flags & SWP_WRITEOK && !(si->flags & SWP_READONLY)) { nr_swap_pages--; /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, SWAP_MAP); @@ -579,6 +590,7 @@ static int swap_entry_free(struct swap_i count = p->swap_map[offset]; /* free if no reference */ if (!count) { + ub_swapentry_dec(p, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -608,6 +620,8 @@ void swap_free(swp_entry_t entry) } } +EXPORT_SYMBOL(swap_free); + /* * Called after dropping swapcache to decrease refcnt to swap entries. */ @@ -692,6 +706,25 @@ int try_to_free_swap(struct page *page) return 1; } +int swap_readonly(struct page *page) +{ + swp_entry_t entry; + struct swap_info_struct *p; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (p == NULL) + return 0; + + spin_unlock(&swap_lock); + if ((p->flags & (SWP_USED|SWP_WRITEOK|SWP_READONLY)) == + (SWP_USED|SWP_WRITEOK)) + return 0; + + return 1; +} + + /* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. @@ -730,6 +763,7 @@ int free_swap_and_cache(swp_entry_t entr } return p != NULL; } +EXPORT_SYMBOL(free_swap_and_cache); #ifdef CONFIG_HIBERNATION /* @@ -813,12 +847,14 @@ unsigned int count_swap_pages(int type, * force COW, vm_page_prot omits write permission from any private vma. 
*/ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, swp_entry_t entry, struct page *page) + unsigned long addr, swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { struct mem_cgroup *ptr = NULL; spinlock_t *ptl; pte_t *pte; int ret = 1; + struct mm_struct *mm = vma->vm_mm; if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { ret = -ENOMEM; @@ -833,9 +869,11 @@ static int unuse_pte(struct vm_area_stru goto out; } - inc_mm_counter(vma->vm_mm, anon_rss); + inc_mm_counter(mm, anon_rss); + ub_unused_privvm_dec(mm, vma); + pb_add_ref(page, mm, pb); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, + set_pte_at(mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, addr); mem_cgroup_commit_charge_swapin(page, ptr); @@ -853,7 +891,8 @@ out_nolock: static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pte_t swp_pte = swp_entry_to_pte(entry); pte_t *pte; @@ -876,7 +915,7 @@ static int unuse_pte_range(struct vm_are */ if (unlikely(pte_same(*pte, swp_pte))) { pte_unmap(pte); - ret = unuse_pte(vma, pmd, addr, entry, page); + ret = unuse_pte(vma, pmd, addr, entry, page, pb); if (ret) goto out; pte = pte_offset_map(pmd, addr); @@ -889,7 +928,8 @@ out: static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pmd_t *pmd; unsigned long next; @@ -900,7 +940,7 @@ static inline int unuse_pmd_range(struct next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); + ret = unuse_pte_range(vma, pmd, addr, next, entry, page, pb); if (ret) return ret; } while (pmd++, addr = next, 
addr != end); @@ -909,7 +949,8 @@ static inline int unuse_pmd_range(struct static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pud_t *pud; unsigned long next; @@ -920,7 +961,7 @@ static inline int unuse_pud_range(struct next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - ret = unuse_pmd_range(vma, pud, addr, next, entry, page); + ret = unuse_pmd_range(vma, pud, addr, next, entry, page, pb); if (ret) return ret; } while (pud++, addr = next, addr != end); @@ -928,7 +969,8 @@ static inline int unuse_pud_range(struct } static int unuse_vma(struct vm_area_struct *vma, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pgd_t *pgd; unsigned long addr, end, next; @@ -950,7 +992,7 @@ static int unuse_vma(struct vm_area_stru next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_pud_range(vma, pgd, addr, next, entry, page); + ret = unuse_pud_range(vma, pgd, addr, next, entry, page, pb); if (ret) return ret; } while (pgd++, addr = next, addr != end); @@ -958,7 +1000,8 @@ static int unuse_vma(struct vm_area_stru } static int unuse_mm(struct mm_struct *mm, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { struct vm_area_struct *vma; int ret = 0; @@ -974,7 +1017,7 @@ static int unuse_mm(struct mm_struct *mm lock_page(page); } for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) + if (vma->anon_vma && (ret = unuse_vma(vma, entry, page, pb))) break; } up_read(&mm->mmap_sem); @@ -1036,6 +1079,7 @@ static int try_to_unuse(unsigned int typ int retval = 0; int reset_overflow = 0; int shmem; + struct page_beancounter *pb; /* * When searching mms for an entry, a good strategy is 
to @@ -1088,6 +1132,13 @@ static int try_to_unuse(unsigned int typ break; } + pb = NULL; + if (pb_alloc_all(&pb)) { + page_cache_release(page); + retval = -ENOMEM; + break; + } + /* * Don't hold on to start_mm if it looks like exiting. */ @@ -1110,6 +1161,20 @@ static int try_to_unuse(unsigned int typ lock_page(page); wait_on_page_writeback(page); + /* If read failed we cannot map not-uptodate page to + * user space. Actually, we are in serious troubles, + * we do not even know what process to kill. So, the only + * variant remains: to stop swapoff() and allow someone + * to kill processes to zap invalid pages. + */ + if (unlikely(!PageUptodate(page))) { + pb_free_list(&pb); + unlock_page(page); + page_cache_release(page); + retval = -EIO; + break; + } + /* * Remove all references to entry. * Whenever we reach init_mm, there's no address space @@ -1121,7 +1186,7 @@ static int try_to_unuse(unsigned int typ if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else - retval = unuse_mm(start_mm, entry, page); + retval = unuse_mm(start_mm, entry, page, &pb); } if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); @@ -1151,7 +1216,7 @@ static int try_to_unuse(unsigned int typ set_start_mm = 1; shmem = shmem_unuse(entry, page); } else - retval = unuse_mm(mm, entry, page); + retval = unuse_mm(mm, entry, page, &pb); if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); @@ -1173,6 +1238,8 @@ static int try_to_unuse(unsigned int typ retval = shmem; break; } + + pb_free_list(&pb); if (retval) { unlock_page(page); page_cache_release(page); @@ -1520,6 +1587,10 @@ SYSCALL_DEFINE1(swapoff, const char __us int i, type, prev; int err; + /* VE admin check is just to be on the safe side, the admin may affect + * swaps only if he has access to special, i.e. if he has been granted + * access to the block device or if the swap file is in the area + * visible to him. 
*/ if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1629,6 +1700,7 @@ SYSCALL_DEFINE1(swapoff, const char __us spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + ub_swap_fini(p); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -1651,6 +1723,8 @@ out: return err; } +EXPORT_SYMBOL(sys_swapoff); + #ifdef CONFIG_PROC_FS /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) @@ -1731,21 +1805,55 @@ static const struct seq_operations swaps .show = swap_show }; +#include + +static int swap_show_ve(struct seq_file *swap, void *v) +{ + struct meminfo mi; + + memset(&mi, 0, sizeof(mi)); + si_swapinfo(&mi.si); + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) + & NOTIFY_FAIL) + goto out; + + seq_printf(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + if (!mi.si.totalswap) + goto out; + seq_printf(swap, "%-40s%s\t%lu\t%lu\t%d\n", + "/dev/null", + "partition", + mi.si.totalswap << (PAGE_SHIFT - 10), + (mi.si.totalswap - mi.si.freeswap) << (PAGE_SHIFT - 10), + -1); +out: + return 0; +} + static int swaps_open(struct inode *inode, struct file *file) { + if (!ve_is_super(get_exec_env())) + return single_open(file, &swap_show_ve, NULL); return seq_open(file, &swaps_op); } +static int swaps_release(struct inode *inode, struct file *file) +{ + if (!ve_is_super(file->owner_env)) + return single_release(inode, file); + return seq_release(inode, file); +} + static const struct file_operations proc_swaps_operations = { .open = swaps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = swaps_release, }; static int __init procswaps_init(void) { - proc_create("swaps", 0, NULL, &proc_swaps_operations); + proc_create("swaps", 0, &glob_proc_root, &proc_swaps_operations); return 0; } __initcall(procswaps_init); @@ -1975,6 +2083,11 @@ SYSCALL_DEFINE2(swapon, const char __use goto bad_swap; } + if (ub_swap_init(p, maxpages)) { + error = -ENOMEM; + goto bad_swap; + } + if 
(p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { p->flags |= SWP_SOLIDSTATE; @@ -1993,6 +2106,8 @@ SYSCALL_DEFINE2(swapon, const char __use p->prio = --least_priority; p->swap_map = swap_map; p->flags |= SWP_WRITEOK; + if (swap_flags & SWAP_FLAG_READONLY) + p->flags |= SWP_READONLY; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; @@ -2051,6 +2166,8 @@ out: return error; } +EXPORT_SYMBOL(sys_swapon); + void si_swapinfo(struct sysinfo *val) { unsigned int i; @@ -2148,6 +2265,8 @@ void swap_duplicate(swp_entry_t entry) __swap_duplicate(entry, SWAP_MAP); } +EXPORT_SYMBOL(swap_duplicate); + /* * @entry: swap entry for which we allocate swap cache. * diff -urNp linux-2.6.32.48/mm/swap_state.c linux-2.6.32.48-openvz/mm/swap_state.c --- linux-2.6.32.48/mm/swap_state.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/swap_state.c 2011-11-21 17:40:47.000000000 -0500 @@ -21,6 +21,9 @@ #include +#include +#include + /* * swapper_space is a fiction, retained to simplify the path through * vmscan's shrink_page_list, to make sync_page look nicer, and to allow @@ -46,6 +49,7 @@ struct address_space swapper_space = { .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, }; +EXPORT_SYMBOL(swapper_space); #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -70,7 +74,7 @@ void show_swap_cache_info(void) * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -static int __add_to_swap_cache(struct page *page, swp_entry_t entry) +int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; @@ -119,6 +123,8 @@ int add_to_swap_cache(struct page *page, return error; } +EXPORT_SYMBOL(add_to_swap_cache); + /* * This must be called only on pages that have * been verified to be in the swap cache. 
@@ -148,11 +154,18 @@ int add_to_swap(struct page *page) { swp_entry_t entry; int err; + struct user_beancounter *ub; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageUptodate(page)); - entry = get_swap_page(); + + ub = pb_grab_page_ub(page); + if (IS_ERR(ub)) + return 0; + + entry = get_swap_page(ub); + put_beancounter(ub); if (!entry.val) return 0; @@ -348,6 +361,8 @@ struct page *read_swap_cache_async(swp_e return found_page; } +EXPORT_SYMBOL(read_swap_cache_async); + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory diff -urNp linux-2.6.32.48/mm/truncate.c linux-2.6.32.48-openvz/mm/truncate.c --- linux-2.6.32.48/mm/truncate.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/truncate.c 2011-11-21 17:40:47.000000000 -0500 @@ -78,6 +78,7 @@ void cancel_dirty_page(struct page *page BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); + ub_io_release_context(page, account_size); } } } diff -urNp linux-2.6.32.48/mm/vmalloc.c linux-2.6.32.48-openvz/mm/vmalloc.c --- linux-2.6.32.48/mm/vmalloc.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/vmalloc.c 2011-11-21 17:40:47.000000000 -0500 @@ -31,6 +31,9 @@ #include #include +#include +#include + /*** Page table manipulation functions ***/ @@ -1359,7 +1362,7 @@ struct vm_struct *remove_vm_area(const v return NULL; } -static void __vunmap(const void *addr, int deallocate_pages) +static void __vunmap(const void *addr, int deallocate_pages, int uncharge) { struct vm_struct *area; @@ -1384,6 +1387,8 @@ static void __vunmap(const void *addr, i if (deallocate_pages) { int i; + if (uncharge) + dec_vmalloc_charged(area); for (i = 0; i < area->nr_pages; i++) { struct page *page = area->pages[i]; @@ -1417,7 +1422,7 @@ void vfree(const void *addr) kmemleak_free(addr); - __vunmap(addr, 1); + __vunmap(addr, 1, 1); } EXPORT_SYMBOL(vfree); @@ -1434,7 +1439,7 @@ void vunmap(const void *addr) { BUG_ON(in_interrupt()); 
might_sleep(); - __vunmap(addr, 0); + __vunmap(addr, 0, 0); } EXPORT_SYMBOL(vunmap); @@ -1521,10 +1526,12 @@ static void *__vmalloc_area_node(struct if (map_vm_area(area, prot, &pages)) goto fail; + + inc_vmalloc_charged(area, gfp_mask); return area->addr; fail: - vfree(area->addr); + __vunmap(area->addr, 1, 0); return NULL; } @@ -1609,6 +1616,26 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +void *ub_vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); +} +EXPORT_SYMBOL(ub_vmalloc); + +void *vmalloc_best(unsigned long size) +{ + return vmalloc(size); +} + +EXPORT_SYMBOL(vmalloc_best); + +void *ub_vmalloc_best(unsigned long size) +{ + return ub_vmalloc(size); +} + +EXPORT_SYMBOL(ub_vmalloc_best); + /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size @@ -1650,6 +1677,13 @@ void *vmalloc_node(unsigned long size, i } EXPORT_SYMBOL(vmalloc_node); +void *ub_vmalloc_node(unsigned long size, int node) +{ + return __vmalloc_node(size, 1, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ub_vmalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -2345,6 +2379,40 @@ void pcpu_free_vm_areas(struct vm_struct kfree(vms); } +void vprintstat(void) +{ + struct vm_struct *p, *last_p = NULL; + unsigned long addr, size, free_size, max_free_size; + int num; + + addr = VMALLOC_START; + size = max_free_size = 0; + num = 0; + + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + free_size = (unsigned long)p->addr - addr; + if (free_size > max_free_size) + max_free_size = free_size; + addr = (unsigned long)p->addr + p->size; + size += p->size; + ++num; + last_p = p; + } + if (last_p) { + free_size = VMALLOC_END - + ((unsigned long)last_p->addr + last_p->size); + if (free_size > max_free_size) + max_free_size = free_size; + } + read_unlock(&vmlist_lock); + + printk("VMALLOC 
Used: %luKB Total: %luKB Entries: %d\n" + " Max_Free: %luKB Start: %lx End: %lx\n", + size/1024, (VMALLOC_END - VMALLOC_START)/1024, num, + max_free_size/1024, VMALLOC_START, VMALLOC_END); +} + #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) { diff -urNp linux-2.6.32.48/mm/vmscan.c linux-2.6.32.48-openvz/mm/vmscan.c --- linux-2.6.32.48/mm/vmscan.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/vmscan.c 2011-11-21 17:40:47.000000000 -0500 @@ -41,10 +41,14 @@ #include #include +#include +#include + #include #include #include +#include #include "internal.h" @@ -210,6 +214,9 @@ unsigned long shrink_slab(unsigned long if (scanned == 0) scanned = SWAP_CLUSTER_MAX; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + return 1; + if (!down_read_trylock(&shrinker_rwsem)) return 1; /* Assume we'll be able to shrink next time */ @@ -245,6 +252,9 @@ unsigned long shrink_slab(unsigned long int shrink_ret; int nr_before; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + goto done; + nr_before = (*shrinker->shrink)(0, gfp_mask); shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); if (shrink_ret == -1) @@ -259,6 +269,7 @@ unsigned long shrink_slab(unsigned long shrinker->nr += total_scan; } +done: up_read(&shrinker_rwsem); return ret; } @@ -376,6 +387,7 @@ static pageout_t pageout(struct page *pa */ if (page_has_private(page)) { if (try_to_free_buffers(page)) { + ub_io_release_context(page, 0); ClearPageDirty(page); printk("%s: orphaned page\n", __func__); return PAGE_CLEAN; @@ -1358,6 +1370,7 @@ static void shrink_active_list(unsigned struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); unsigned long nr_rotated = 0; + {KSTAT_PERF_ENTER(refill_inact) lru_add_drain(); spin_lock_irq(&zone->lru_lock); nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, @@ -1431,6 +1444,7 @@ static void shrink_active_list(unsigned LRU_BASE + file * LRU_FILE); __mod_zone_page_state(zone, 
NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + KSTAT_PERF_LEAVE(refill_inact)} } static int inactive_anon_is_low_global(struct zone *zone) @@ -1673,6 +1687,8 @@ static void shrink_zone(int priority, st nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, priority); } + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + return; } /* * On large memory systems, scan >> priority can become @@ -1751,6 +1767,9 @@ static void shrink_zones(int priority, s } shrink_zone(priority, zone, sc); + + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + break; } } @@ -1782,10 +1801,13 @@ static unsigned long do_try_to_free_page struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + KSTAT_PERF_ENTER(ttfp); delayacct_freepages_start(); if (scanning_global_lru(sc)) count_vm_event(ALLOCSTALL); + + ub_oom_start(); /* * mem_cgroup will not do shrink_slab. */ @@ -1834,6 +1856,11 @@ static unsigned long do_try_to_free_page sc->may_writepage = 1; } + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) { + ret = 1; + goto out; + } + /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1865,6 +1892,7 @@ out: delayacct_freepages_end(); + KSTAT_PERF_LEAVE(ttfp); return ret; } diff -urNp linux-2.6.32.48/mm/vmstat.c linux-2.6.32.48-openvz/mm/vmstat.c --- linux-2.6.32.48/mm/vmstat.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/mm/vmstat.c 2011-11-21 17:40:47.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -35,6 +36,20 @@ static void sum_vm_events(unsigned long } } +unsigned long vm_events(enum vm_event_item i) +{ + int cpu; + unsigned long sum; + struct vm_event_state *st; + + sum = 0; + for_each_online_cpu(cpu) { + st = &per_cpu(vm_event_states, cpu); + sum += st->event[i]; + } + + return sum; /* total of event i over all online CPUs */
+} /* * Accumulate the vm event counters across all CPUs. * The result is unavoidably approximate - it can change @@ -813,30 +828,40 @@ static void *vmstat_start(struct seq_fil unsigned long *v; #ifdef CONFIG_VM_EVENT_COUNTERS unsigned long *e; +#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + \ + sizeof(struct vm_event_state)) +#else +#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)) #endif int i; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; -#ifdef CONFIG_VM_EVENT_COUNTERS - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(struct vm_event_state), GFP_KERNEL); -#else - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), - GFP_KERNEL); -#endif + v = kmalloc(VMSTAT_BUFSIZE, GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - v[i] = global_page_state(i); + + if (ve_is_super(get_exec_env())) { + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + v[i] = global_page_state(i); #ifdef CONFIG_VM_EVENT_COUNTERS - e = v + NR_VM_ZONE_STAT_ITEMS; - all_vm_events(e); - e[PGPGIN] /= 2; /* sectors -> kbytes */ - e[PGPGOUT] /= 2; -#endif + e = v + NR_VM_ZONE_STAT_ITEMS; + all_vm_events(e); + e[PGPGIN] /= 2; /* sectors -> kbytes */ + e[PGPGOUT] /= 2; +#endif + } else + memset(v, 0, VMSTAT_BUFSIZE); + + if (virtinfo_notifier_call(VITYPE_GENERAL, + VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) { + kfree(v); + m->private = NULL; + return ERR_PTR(-ENOMSG); + } + return v + *pos; } @@ -955,7 +980,7 @@ static int __init setup_vmstat(void) #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); - proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); + proc_create("vmstat", S_IRUGO, &glob_proc_root, &proc_vmstat_file_operations); proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #endif return 0; diff -urNp
linux-2.6.32.48/net/8021q/vlan.c linux-2.6.32.48-openvz/net/8021q/vlan.c --- linux-2.6.32.48/net/8021q/vlan.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/8021q/vlan.c 2011-11-21 17:40:47.000000000 -0500 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -105,7 +106,7 @@ static struct vlan_group *vlan_group_all { struct vlan_group *grp; - grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL); + grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL_UBC); if (!grp) return NULL; @@ -127,7 +128,7 @@ static int vlan_group_prealloc_vid(struc return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; - array = kzalloc(size, GFP_KERNEL); + array = kzalloc(size, GFP_KERNEL_UBC); if (array == NULL) return -ENOBUFS; @@ -147,6 +148,7 @@ void unregister_vlan_dev(struct net_devi const struct net_device_ops *ops = real_dev->netdev_ops; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; + struct ve_struct *env; ASSERT_RTNL(); @@ -164,7 +166,9 @@ void unregister_vlan_dev(struct net_devi synchronize_net(); + env = set_exec_env(dev->owner_env); unregister_netdevice(dev); + set_exec_env(env); /* If the group is now empty, kill off the group. */ if (grp->nr_vlans == 0) { @@ -551,6 +555,17 @@ static struct notifier_block vlan_notifi .notifier_call = vlan_device_event, }; +static inline int vlan_check_caps(void) +{ + if (capable(CAP_NET_ADMIN)) + return 1; +#ifdef CONFIG_VE + if (capable(CAP_VE_NET_ADMIN)) + return 1; +#endif + return 0; +} + /* * VLAN IOCTL handler. 
* o execute requested action or pass command to the device driver @@ -592,7 +607,7 @@ static int vlan_ioctl_handler(struct net switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, @@ -602,7 +617,7 @@ static int vlan_ioctl_handler(struct net case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, @@ -611,7 +626,7 @@ static int vlan_ioctl_handler(struct net case SET_VLAN_FLAG_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, @@ -620,7 +635,7 @@ static int vlan_ioctl_handler(struct net case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; if ((args.u.name_type >= 0) && (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) { @@ -636,14 +651,14 @@ static int vlan_ioctl_handler(struct net case ADD_VLAN_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; unregister_vlan_dev(dev); err = 0; diff -urNp linux-2.6.32.48/net/8021q/vlan_dev.c linux-2.6.32.48-openvz/net/8021q/vlan_dev.c --- linux-2.6.32.48/net/8021q/vlan_dev.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/8021q/vlan_dev.c 2011-11-21 17:40:47.000000000 -0500 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -291,6 +292,7 @@ static int vlan_dev_hard_header(struct s static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ve_struct *env; int i = skb_get_queue_mapping(skb); struct netdev_queue *txq = netdev_get_tx_queue(dev, i); struct vlan_ethhdr *veth = 
(struct vlan_ethhdr *)(skb->data); @@ -324,7 +326,10 @@ static netdev_tx_t vlan_dev_hard_start_x skb->dev = vlan_dev_info(dev)->real_dev; len = skb->len; + skb->owner_env = skb->dev->owner_env; + env = set_exec_env(skb->owner_env); ret = dev_queue_xmit(skb); + set_exec_env(env); if (likely(ret == NET_XMIT_SUCCESS)) { txq->tx_packets++; @@ -338,6 +343,7 @@ static netdev_tx_t vlan_dev_hard_start_x static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ve_struct *env; int i = skb_get_queue_mapping(skb); struct netdev_queue *txq = netdev_get_tx_queue(dev, i); u16 vlan_tci; @@ -350,7 +356,10 @@ static netdev_tx_t vlan_dev_hwaccel_hard skb->dev = vlan_dev_info(dev)->real_dev; len = skb->len; + skb->owner_env = skb->dev->owner_env; + env = set_exec_env(skb->owner_env); ret = dev_queue_xmit(skb); + set_exec_env(env); if (likely(ret == NET_XMIT_SUCCESS)) { txq->tx_packets++; @@ -829,4 +838,6 @@ void vlan_setup(struct net_device *dev) dev->ethtool_ops = &vlan_ethtool_ops; memset(dev->broadcast, 0, ETH_ALEN); + if (!ve_is_super(get_exec_env())) + dev->features |= NETIF_F_VIRTUAL; } diff -urNp linux-2.6.32.48/net/bridge/br.c linux-2.6.32.48-openvz/net/bridge/br.c --- linux-2.6.32.48/net/bridge/br.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/bridge/br.c 2011-11-21 17:40:47.000000000 -0500 @@ -64,6 +64,7 @@ static int __init br_init(void) brioctl_set(br_ioctl_deviceless_stub); br_handle_frame_hook = br_handle_frame; + br_hard_xmit_hook = br_xmit; #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) br_fdb_test_addr_hook = br_fdb_test_addr; @@ -101,6 +102,7 @@ static void __exit br_deinit(void) #endif br_handle_frame_hook = NULL; + br_hard_xmit_hook = NULL; br_fdb_fini(); } diff -urNp linux-2.6.32.48/net/bridge/br_device.c linux-2.6.32.48-openvz/net/bridge/br_device.c --- linux-2.6.32.48/net/bridge/br_device.c 2011-11-08 19:02:43.000000000 -0500 +++ 
linux-2.6.32.48-openvz/net/bridge/br_device.c 2011-11-21 17:40:47.000000000 -0500 @@ -32,16 +32,47 @@ netdev_tx_t br_dev_xmit(struct sk_buff * skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); + skb->brmark = BR_ALREADY_SEEN; + if (dest[0] & 1) br_flood_deliver(br, skb); else if ((dst = __br_fdb_get(br, dest)) != NULL) - br_deliver(dst->dst, skb); + br_deliver(dst->dst, skb, 1); else br_flood_deliver(br, skb); return NETDEV_TX_OK; } +int br_xmit(struct sk_buff *skb, struct net_bridge_port *port) +{ + struct net_bridge *br = port->br; + const unsigned char *dest = skb->data; + struct net_bridge_fdb_entry *dst; + + if (!br->via_phys_dev) + return 0; + + br->dev->stats.tx_packets++; + br->dev->stats.tx_bytes += skb->len; + + skb_reset_mac_header(skb); + skb_pull(skb, ETH_HLEN); + + skb->brmark = BR_ALREADY_SEEN; + + if (dest[0] & 1) + br_xmit_deliver(br, port, skb); + else if ((dst = __br_fdb_get(br, dest)) != NULL) + br_deliver(dst->dst, skb, 0); + else + br_xmit_deliver(br, port, skb); + + skb_push(skb, ETH_HLEN); + + return 0; +} + static int br_dev_open(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); diff -urNp linux-2.6.32.48/net/bridge/br_forward.c linux-2.6.32.48-openvz/net/bridge/br_forward.c --- linux-2.6.32.48/net/bridge/br_forward.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/bridge/br_forward.c 2011-11-21 17:40:47.000000000 -0500 @@ -82,14 +82,24 @@ static void __br_forward(const struct ne } /* called with rcu_read_lock */ -void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) +void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb, int free) { if (should_deliver(to, skb)) { + if (!free) { + struct sk_buff *skb2; + + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { + to->dev->stats.tx_dropped++; + return; + } + skb = skb2; + } __br_deliver(to, skb); return; } - kfree_skb(skb); + if (free) + kfree_skb(skb); } /* called with rcu_read_lock */ @@ -105,6 +115,7 @@ void 
br_forward(const struct net_bridge_ /* called under bridge lock */ static void br_flood(struct net_bridge *br, struct sk_buff *skb, + int free, void (*__packet_hook)(const struct net_bridge_port *p, struct sk_buff *skb)) { @@ -136,18 +147,41 @@ static void br_flood(struct net_bridge * return; } - kfree_skb(skb); + if (free) + kfree_skb(skb); } /* called with rcu_read_lock */ void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb) { - br_flood(br, skb, __br_deliver); + br_flood(br, skb, 1, __br_deliver); +} + +/* called with rcu_read_lock */ +void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, + struct sk_buff *skb) +{ + struct net_bridge_port *p; + + list_for_each_entry_rcu(p, &br->port_list, list) { + if (p == port) + continue; + if (should_deliver(p, skb)) { + struct sk_buff *skb2; + + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { + br->dev->stats.tx_dropped++; + return; + } + __br_deliver(p, skb2); + } + } } /* called under bridge lock */ void br_flood_forward(struct net_bridge *br, struct sk_buff *skb) { - br_flood(br, skb, __br_forward); + skb->brmark = BR_ALREADY_SEEN; + br_flood(br, skb, 1, __br_forward); } diff -urNp linux-2.6.32.48/net/bridge/br_if.c linux-2.6.32.48-openvz/net/bridge/br_if.c --- linux-2.6.32.48/net/bridge/br_if.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/bridge/br_if.c 2011-11-21 17:40:47.000000000 -0500 @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -158,6 +159,11 @@ static void del_br(struct net_bridge *br { struct net_bridge_port *p, *n; + if (br->master_dev) { + dev_put(br->master_dev); + br->master_dev = NULL; + } + list_for_each_entry_safe(p, n, &br->port_list, list) { del_nbp(p); } @@ -423,6 +429,10 @@ int br_add_if(struct net_bridge *br, str if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) br_stp_enable_port(p); + if (!(dev->features & NETIF_F_VIRTUAL) && !br->master_dev) { + dev_hold(dev); + br->master_dev = dev; + 
} spin_unlock_bh(&br->lock); br_ifinfo_notify(RTM_NEWLINK, p); @@ -458,6 +468,16 @@ int br_del_if(struct net_bridge *br, str spin_lock_bh(&br->lock); br_stp_recalculate_bridge_id(br); br_features_recompute(br); + if (br->master_dev == dev) { + br->master_dev = NULL; + dev_put(dev); + list_for_each_entry(p, &br->port_list, list) + if (!(p->dev->features & NETIF_F_VIRTUAL)) { + dev_hold(p->dev); + br->master_dev = p->dev; + break; + } + } spin_unlock_bh(&br->lock); return 0; diff -urNp linux-2.6.32.48/net/bridge/br_input.c linux-2.6.32.48-openvz/net/bridge/br_input.c --- linux-2.6.32.48/net/bridge/br_input.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/bridge/br_input.c 2011-11-21 17:40:47.000000000 -0500 @@ -28,7 +28,13 @@ static void br_pass_frame_up(struct net_ brdev->stats.rx_bytes += skb->len; indev = skb->dev; - skb->dev = brdev; + if (!br->via_phys_dev) + skb->dev = brdev; + else { + skb->brmark = BR_ALREADY_SEEN; + if (br->master_dev) + skb->dev = br->master_dev; + } NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL, netif_receive_skb); @@ -56,7 +62,7 @@ int br_handle_frame_finish(struct sk_buf /* The packet skb2 goes to the local host (NULL to skip). */ skb2 = NULL; - if (br->dev->flags & IFF_PROMISC) + if ((br->dev->flags & IFF_PROMISC) && !br->via_phys_dev) skb2 = skb; dst = NULL; @@ -147,6 +153,8 @@ struct sk_buff *br_handle_frame(struct n forward: switch (p->state) { + struct net_device *out; + case BR_STATE_FORWARDING: rhook = rcu_dereference(br_should_route_hook); if (rhook != NULL) { @@ -156,7 +164,12 @@ forward: } /* fall through */ case BR_STATE_LEARNING: - if (!compare_ether_addr(p->br->dev->dev_addr, dest)) + if (skb->brmark == BR_ALREADY_SEEN) + return skb; + + out = p->br->via_phys_dev ? 
p->br->master_dev : p->br->dev; + + if (out && !compare_ether_addr(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, diff -urNp linux-2.6.32.48/net/bridge/br_ioctl.c linux-2.6.32.48-openvz/net/bridge/br_ioctl.c --- linux-2.6.32.48/net/bridge/br_ioctl.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/bridge/br_ioctl.c 2011-11-21 17:40:47.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -140,6 +141,7 @@ static int old_dev_ioctl(struct net_devi b.root_port = br->root_port; b.stp_enabled = (br->stp_enabled != BR_NO_STP); + b.via_phys_dev = br->via_phys_dev; b.ageing_time = jiffies_to_clock_t(br->ageing_time); b.hello_timer_value = br_timer_value(&br->hello_timer); b.tcn_timer_value = br_timer_value(&br->tcn_timer); @@ -262,6 +264,13 @@ static int old_dev_ioctl(struct net_devi br_stp_set_enabled(br, args[1]); return 0; + case BRCTL_SET_VIA_ORIG_DEV: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + br->via_phys_dev = args[1] ? 
1 : 0; + return 0; + case BRCTL_SET_BRIDGE_PRIORITY: if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -371,6 +380,9 @@ static int old_deviceless(struct net *ne int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg) { + if (!(get_exec_env()->features & VE_FEATURE_BRIDGE)) + return -ENOTTY; + switch (cmd) { case SIOCGIFBR: case SIOCSIFBR: diff -urNp linux-2.6.32.48/net/bridge/br_private.h linux-2.6.32.48-openvz/net/bridge/br_private.h --- linux-2.6.32.48/net/bridge/br_private.h 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/bridge/br_private.h 2011-11-21 17:40:47.000000000 -0500 @@ -17,6 +17,10 @@ #include #include +#include +#include +#include + #define BR_HASH_BITS 8 #define BR_HASH_SIZE (1 << BR_HASH_BITS) @@ -92,6 +96,8 @@ struct net_bridge spinlock_t lock; struct list_head port_list; struct net_device *dev; + struct net_device *master_dev; + unsigned char via_phys_dev; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; struct list_head age_list; @@ -146,6 +152,7 @@ static inline int br_is_root_bridge(cons extern void br_dev_setup(struct net_device *dev); extern netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev); +extern int br_xmit(struct sk_buff *skb, struct net_bridge_port *port); /* br_fdb.c */ extern int br_fdb_init(void); @@ -170,12 +177,13 @@ extern void br_fdb_update(struct net_bri /* br_forward.c */ extern void br_deliver(const struct net_bridge_port *to, - struct sk_buff *skb); + struct sk_buff *skb, int free); extern int br_dev_queue_push_xmit(struct sk_buff *skb); extern void br_forward(const struct net_bridge_port *to, struct sk_buff *skb); extern int br_forward_finish(struct sk_buff *skb); extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb); +extern void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb); extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb); /* br_if.c */ diff -urNp
linux-2.6.32.48/net/bridge/br_sysfs_br.c linux-2.6.32.48-openvz/net/bridge/br_sysfs_br.c --- linux-2.6.32.48/net/bridge/br_sysfs_br.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/bridge/br_sysfs_br.c 2011-11-21 17:40:47.000000000 -0500 @@ -182,6 +182,28 @@ static ssize_t store_stp_state(struct de static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state, store_stp_state); +static ssize_t show_via_phys_dev_state(struct device *cd, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%d\n", br->via_phys_dev); +} + +static int set_via_phys_dev_state(struct net_bridge *br, unsigned long val) +{ + br->via_phys_dev = val ? 1 : 0; + return 0; +} + +static ssize_t store_via_phys_dev_state(struct device *cd, + struct device_attribute *attr, const char *buf, size_t len) +{ + return store_bridge_parm(cd, buf, len, set_via_phys_dev_state); +} + +static DEVICE_ATTR(via_phys_dev, S_IRUGO | S_IWUSR, show_via_phys_dev_state, + store_via_phys_dev_state); + static ssize_t show_priority(struct device *d, struct device_attribute *attr, char *buf) { @@ -351,6 +373,7 @@ static struct attribute *bridge_attrs[] &dev_attr_max_age.attr, &dev_attr_ageing_time.attr, &dev_attr_stp_state.attr, + &dev_attr_via_phys_dev.attr, &dev_attr_priority.attr, &dev_attr_bridge_id.attr, &dev_attr_root_id.attr, diff -urNp linux-2.6.32.48/net/core/datagram.c linux-2.6.32.48-openvz/net/core/datagram.c --- linux-2.6.32.48/net/core/datagram.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/datagram.c 2011-11-21 17:40:47.000000000 -0500 @@ -57,6 +57,8 @@ #include #include +#include + /* * Is a socket 'connection oriented' ? 
*/ @@ -723,6 +725,7 @@ unsigned int datagram_poll(struct file * { struct sock *sk = sock->sk; unsigned int mask; + int no_ubc_space; sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; @@ -732,8 +735,14 @@ unsigned int datagram_poll(struct file * mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { + no_ubc_space = 0; mask |= POLLHUP; + } else { + no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); + if (no_ubc_space) + ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); + } /* readable? */ if (!skb_queue_empty(&sk->sk_receive_queue) || @@ -750,7 +759,7 @@ unsigned int datagram_poll(struct file * } /* writable? */ - if (sock_writeable(sk)) + if (!no_ubc_space && sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff -urNp linux-2.6.32.48/net/core/dev.c linux-2.6.32.48-openvz/net/core/dev.c --- linux-2.6.32.48/net/core/dev.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/dev.c 2011-11-21 17:40:47.000000000 -0500 @@ -130,6 +130,9 @@ #include "net-sysfs.h" +#include +#include + /* Instead of increasing this, you should create a hash table. 
*/ #define MAX_GRO_SKBS 8 @@ -193,20 +196,6 @@ static struct list_head ptype_all __read DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -#define NETDEV_HASHBITS 8 -#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) - -static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) -{ - unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; -} - -static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) -{ - return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; -} - /* Device list insertion */ static int list_netdevice(struct net_device *dev) { @@ -922,15 +911,10 @@ int dev_change_name(struct net_device *d strlcpy(dev->name, newname, IFNAMSIZ); rollback: - /* For now only devices in the initial network namespace - * are in sysfs. - */ - if (net == &init_net) { - ret = device_rename(&dev->dev, dev->name); - if (ret) { - memcpy(dev->name, oldname, IFNAMSIZ); - return ret; - } + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + return ret; } write_lock_bh(&dev_base_lock); @@ -1705,6 +1689,24 @@ static int dev_gso_segment(struct sk_buf return 0; } +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port); +EXPORT_SYMBOL(br_hard_xmit_hook); +static __inline__ int bridge_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct net_bridge_port *port; + + if (((port = rcu_dereference(dev->br_port)) == NULL) || + (skb->brmark == BR_ALREADY_SEEN)) + return 0; + + return br_hard_xmit_hook(skb, port); +} +#else +#define bridge_hard_start_xmit(skb, dev) (0) +#endif + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { @@ -1729,6 +1731,8 @@ int dev_hard_start_xmit(struct sk_buff * if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); + 
+ bridge_hard_start_xmit(skb, dev); + rc = ops->ndo_start_xmit(skb, dev); if (rc == NETDEV_TX_OK) txq_trans_update(txq); @@ -1763,6 +1767,8 @@ gso: if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(nskb); + bridge_hard_start_xmit(nskb, dev); + rc = ops->ndo_start_xmit(nskb, dev); if (unlikely(rc != NETDEV_TX_OK)) { nskb->next = skb->next; @@ -2304,6 +2310,7 @@ int netif_receive_skb(struct sk_buff *sk struct net_device *null_or_orig; int ret = NET_RX_DROP; __be16 type; + struct ve_struct *old_ve; if (!skb->tstamp.tv64) net_timestamp(skb); @@ -2333,6 +2340,16 @@ int netif_receive_skb(struct sk_buff *sk skb_reset_transport_header(skb); skb->mac_len = skb->network_header - skb->mac_header; +#ifdef CONFIG_VE + /* + * The skb may have been allocated in a VE context other than its device's. + * So, set the correct owner_env. + */ + skb->owner_env = skb->dev->owner_env; + BUG_ON(skb->owner_env == NULL); +#endif + old_ve = set_exec_env(skb->owner_env); + pt_prev = NULL; rcu_read_lock(); @@ -2391,6 +2408,7 @@ ncls: out: rcu_read_unlock(); + (void)set_exec_env(old_ve); return ret; } EXPORT_SYMBOL(netif_receive_skb); @@ -3412,8 +3430,13 @@ static int __dev_set_promiscuity(struct return -EOVERFLOW; } } - if (dev->flags != old_flags) { - printk(KERN_INFO "device %s %s promiscuous mode\n", + /* + * Promiscuous mode on LOOPBACK/POINTOPOINT devices does + * not mean anything + */ + if ((dev->flags != old_flags) && + !(dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) { + ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : "left"); if (audit_enabled) { @@ -4565,16 +4588,25 @@ int dev_ioctl(struct net *net, unsigned * - require strict serialization.
* - do not return a value */ + case SIOCSIFMTU: + case SIOCSIFHWADDR: case SIOCSIFFLAGS: + case SIOCSIFTXQLEN: + if (!capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + return ret; + case SIOCSIFMETRIC: - case SIOCSIFMTU: case SIOCSIFMAP: - case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFHWBROADCAST: - case SIOCSIFTXQLEN: case SIOCSMIIREG: case SIOCBONDENSLAVE: case SIOCBONDRELEASE: @@ -4637,12 +4669,11 @@ int dev_ioctl(struct net *net, unsigned */ static int dev_new_index(struct net *net) { - static int ifindex; for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return ifindex; + if (++net->ifindex <= 0) + net->ifindex = 1; + if (!__dev_get_by_index(net, net->ifindex)) + return net->ifindex; } } @@ -4797,6 +4828,10 @@ int register_netdevice(struct net_device BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); + ret = -EPERM; + if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) + goto out; + spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); netdev_init_queue_locks(dev); @@ -4867,6 +4902,10 @@ int register_netdevice(struct net_device set_bit(__LINK_STATE_PRESENT, &dev->state); + dev->owner_env = get_exec_env(); + netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub()); + netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub()); + dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); @@ -5047,12 +5086,14 @@ static void netdev_wait_allrefs(struct n void netdev_run_todo(void) { struct list_head list; + struct ve_struct *old_ve; /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); + old_ve = get_exec_env(); while (!list_empty(&list)) { struct net_device *dev = list_entry(list.next, struct net_device, todo_list); @@ -5065,6 +5106,7 @@ void netdev_run_todo(void) continue; } 
+ (void)set_exec_env(dev->owner_env); dev->reg_state = NETREG_UNREGISTERED; on_each_cpu(flush_backlog, dev, 1); @@ -5077,12 +5119,21 @@ void netdev_run_todo(void) WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); + put_beancounter(netdev_bc(dev)->exec_ub); + put_beancounter(netdev_bc(dev)->owner_ub); + netdev_bc(dev)->exec_ub = NULL; + netdev_bc(dev)->owner_ub = NULL; + + /* It must be the very last action, + * after this 'dev' may point to freed up memory. + */ if (dev->destructor) dev->destructor(dev); /* Free network device */ kobject_put(&dev->dev.kobj); } + (void)set_exec_env(old_ve); } /** @@ -5165,13 +5216,13 @@ struct net_device *alloc_netdev_mq(int s /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kzalloc(alloc_size, GFP_KERNEL); + p = kzalloc(alloc_size, GFP_KERNEL_UBC); if (!p) { printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); return NULL; } - tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); + tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL_UBC); if (!tx) { printk(KERN_ERR "alloc_netdev: Unable to allocate " "tx qdiscs.\n"); @@ -5314,11 +5365,18 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. */ -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, + struct user_beancounter *exec_ub) { char buf[IFNAMSIZ]; const char *destname; int err; + struct user_beancounter *tmp_ub; +#ifdef CONFIG_VE + struct ve_struct *cur_ve = get_exec_env(); + struct ve_struct *src_ve = dev->owner_env; + struct ve_struct *dst_ve = net->owner_ve; +#endif ASSERT_RTNL(); @@ -5327,15 +5385,6 @@ int dev_change_net_namespace(struct net_ if (dev->features & NETIF_F_NETNS_LOCAL) goto out; -#ifdef CONFIG_SYSFS - /* Don't allow real devices to be moved when sysfs - * is enabled. 
- */ - err = -EINVAL; - if (dev->dev.parent) - goto out; -#endif - /* Ensure the device has been registrered */ err = -EINVAL; if (dev->reg_state != NETREG_REGISTERED) @@ -5378,6 +5427,11 @@ int dev_change_net_namespace(struct net_ err = -ENODEV; unlist_netdevice(dev); + dev->owner_env = dst_ve; + tmp_ub = netdev_bc(dev)->exec_ub; + netdev_bc(dev)->exec_ub = get_beancounter(exec_ub); + put_beancounter(tmp_ub); + synchronize_net(); /* Shutdown queueing discipline. */ @@ -5386,7 +5440,9 @@ int dev_change_net_namespace(struct net_ /* Notify protocols, that we are about to destroy this device. They should clean all the things. */ + set_exec_env(src_ve); call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + (void)set_exec_env(cur_ve); /* * Flush the unicast and multicast chains @@ -5394,7 +5450,9 @@ int dev_change_net_namespace(struct net_ dev_unicast_flush(dev); dev_addr_discard(dev); + set_exec_env(src_ve); netdev_unregister_kobject(dev); + set_exec_env(cur_ve); /* Actually switch the network namespace */ dev_net_set(dev, net); @@ -5412,14 +5470,18 @@ int dev_change_net_namespace(struct net_ } /* Fixup kobjects */ + set_exec_env(dst_ve); err = netdev_register_kobject(dev); + set_exec_env(cur_ve); WARN_ON(err); /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. 
*/ + set_exec_env(dst_ve); call_netdevice_notifiers(NETDEV_REGISTER, dev); + (void)set_exec_env(cur_ve); /* * Prevent userspace races by waiting until the network @@ -5434,6 +5496,14 @@ out: } EXPORT_SYMBOL_GPL(dev_change_net_namespace); +int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +{ + struct user_beancounter *ub = get_exec_ub(); + + return __dev_change_net_namespace(dev, net, pat, ub); +} +EXPORT_SYMBOL(__dev_change_net_namespace); + static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu) @@ -5525,7 +5595,7 @@ static struct hlist_head *netdev_create_ int i; struct hlist_head *hash; - hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL_UBC); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); @@ -5719,3 +5789,32 @@ static int __init initialize_hashrnd(voi late_initcall_sync(initialize_hashrnd); +static LIST_HEAD(dev_cpt_operations); + +void register_netdev_rst(struct netdev_rst *ops) +{ + rtnl_lock(); + list_add_tail(&ops->list, &dev_cpt_operations); + __rtnl_unlock(); +} +EXPORT_SYMBOL(register_netdev_rst); + +void unregister_netdev_rst(struct netdev_rst *ops) +{ + rtnl_lock(); + list_del(&ops->list); + __rtnl_unlock(); +} +EXPORT_SYMBOL(unregister_netdev_rst); + +struct netdev_rst *netdev_find_rst(int cpt_object, struct netdev_rst *ops) +{ + ops = list_prepare_entry(ops, &dev_cpt_operations, list); + + list_for_each_entry_continue(ops, &dev_cpt_operations, list) + if (ops->cpt_object == cpt_object) + return ops; + + return NULL; +} +EXPORT_SYMBOL(netdev_find_rst); diff -urNp linux-2.6.32.48/net/core/dst.c linux-2.6.32.48-openvz/net/core/dst.c --- linux-2.6.32.48/net/core/dst.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/dst.c 2011-11-21 17:40:47.000000000 -0500 @@ -313,6 +313,7 @@ static int dst_dev_event(struct notifier switch (event) { case 
NETDEV_UNREGISTER: case NETDEV_DOWN: + dst_gc_task(NULL); mutex_lock(&dst_gc_mutex); for (dst = dst_busy_list; dst; dst = dst->next) { last = dst; diff -urNp linux-2.6.32.48/net/core/ethtool.c linux-2.6.32.48-openvz/net/core/ethtool.c --- linux-2.6.32.48/net/core/ethtool.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/ethtool.c 2011-11-21 17:40:47.000000000 -0500 @@ -975,7 +975,7 @@ int dev_ethtool(struct net *net, struct case ETHTOOL_GRXCLSRLALL: break; default: - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; } diff -urNp linux-2.6.32.48/net/core/fib_rules.c linux-2.6.32.48-openvz/net/core/fib_rules.c --- linux-2.6.32.48/net/core/fib_rules.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/fib_rules.c 2011-11-21 17:40:47.000000000 -0500 @@ -20,7 +20,7 @@ int fib_default_rule_add(struct fib_rule { struct fib_rule *r; - r = kzalloc(ops->rule_size, GFP_KERNEL); + r = kzalloc(ops->rule_size, GFP_KERNEL_UBC); if (r == NULL) return -ENOMEM; @@ -238,7 +238,7 @@ static int fib_nl_newrule(struct sk_buff if (err < 0) goto errout; - rule = kzalloc(ops->rule_size, GFP_KERNEL); + rule = kzalloc(ops->rule_size, GFP_KERNEL_UBC); if (rule == NULL) { err = -ENOMEM; goto errout; diff -urNp linux-2.6.32.48/net/core/filter.c linux-2.6.32.48-openvz/net/core/filter.c --- linux-2.6.32.48/net/core/filter.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/filter.c 2011-11-21 17:40:47.000000000 -0500 @@ -505,7 +505,7 @@ int sk_attach_filter(struct sock_fprog * if (fprog->filter == NULL) return -EINVAL; - fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { diff -urNp linux-2.6.32.48/net/core/neighbour.c linux-2.6.32.48-openvz/net/core/neighbour.c --- linux-2.6.32.48/net/core/neighbour.c 2011-11-08 19:02:43.000000000 -0500 
+++ linux-2.6.32.48-openvz/net/core/neighbour.c 2011-11-21 17:40:47.000000000 -0500 @@ -21,6 +21,8 @@ #include #include #include +#include +#include #ifdef CONFIG_SYSCTL #include #endif @@ -35,6 +37,7 @@ #include #include #include +#include #define NEIGH_DEBUG 1 @@ -264,6 +267,7 @@ static struct neighbour *neigh_alloc(str int entries; entries = atomic_inc_return(&tbl->entries) - 1; + n = ERR_PTR(-ENOBUFS); if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -274,7 +278,7 @@ static struct neighbour *neigh_alloc(str n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); if (!n) - goto out_entries; + goto out_nomem; skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); @@ -291,6 +295,8 @@ static struct neighbour *neigh_alloc(str out: return n; +out_nomem: + n = ERR_PTR(-ENOMEM); out_entries: atomic_dec(&tbl->entries); goto out; @@ -409,12 +415,11 @@ struct neighbour *neigh_create(struct ne u32 hash_val; int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neighbour *n1, *rc, *n; - if (!n) { - rc = ERR_PTR(-ENOBUFS); + rc = n = neigh_alloc(tbl); + if (IS_ERR(n)) goto out; - } memcpy(n->primary_key, pkey, key_len); n->dev = dev; @@ -734,10 +739,21 @@ static void neigh_periodic_work(struct w if (atomic_read(&n->refcnt) == 1 && (state == NUD_FAILED || time_after(jiffies, n->used + n->parms->gc_staletime))) { + struct net_device *dev = n->dev; + struct ve_struct *ve; + struct user_beancounter *ub; + *np = n->next; n->dead = 1; write_unlock(&n->lock); + + ve = set_exec_env(dev->owner_env); + ub = set_exec_ub(netdev_bc(dev)->owner_ub); + neigh_cleanup_and_release(n); + + set_exec_ub(ub); + set_exec_env(ve); continue; } write_unlock(&n->lock); @@ -800,6 +816,11 @@ static void neigh_timer_handler(unsigned struct neighbour *neigh = (struct neighbour *)arg; unsigned state; int notify = 0; + struct ve_struct *env; + struct user_beancounter *ub; + + env = 
set_exec_env(neigh->dev->owner_env); + ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub); write_lock(&neigh->lock); @@ -885,6 +906,8 @@ out: neigh_update_notify(neigh); neigh_release(neigh); + (void)set_exec_ub(ub); + (void)set_exec_env(env); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) @@ -1276,9 +1299,16 @@ static void neigh_proxy_process(unsigned if (tdif <= 0) { struct net_device *dev = skb->dev; __skb_unlink(skb, &tbl->proxy_queue); - if (tbl->proxy_redo && netif_running(dev)) + if (tbl->proxy_redo && netif_running(dev)) { + struct ve_struct *ve; + struct user_beancounter *ub; + + ve = set_exec_env(dev->owner_env); + ub = set_exec_ub(netdev_bc(dev)->owner_ub); tbl->proxy_redo(skb); - else + set_exec_ub(ub); + set_exec_env(ve); + } else kfree_skb(skb); dev_put(dev); diff -urNp linux-2.6.32.48/net/core/net_namespace.c linux-2.6.32.48-openvz/net/core/net_namespace.c --- linux-2.6.32.48/net/core/net_namespace.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/net_namespace.c 2011-11-21 17:40:47.000000000 -0500 @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,10 @@ static __net_init int setup_net(struct n struct pernet_operations *ops; int error = 0; +#ifdef CONFIG_VE + net->owner_ve = get_exec_env(); +#endif + atomic_set(&net->count, 1); #ifdef NETNS_REFCNT_DEBUG @@ -106,6 +111,8 @@ out_free: static void net_free(struct net *net) { + struct completion *sysfs_completion; + #ifdef NETNS_REFCNT_DEBUG if (unlikely(atomic_read(&net->use_count) != 0)) { printk(KERN_EMERG "network namespace not free! 
Usage: %d\n", @@ -113,8 +120,11 @@ static void net_free(struct net *net) return; } #endif + sysfs_completion = net->sysfs_completion; kfree(net->gen); kmem_cache_free(net_cachep, net); + if (sysfs_completion) + complete(sysfs_completion); } static struct net *net_create(void) @@ -151,6 +161,7 @@ static void cleanup_net(struct work_stru { struct pernet_operations *ops; struct net *net; + struct ve_struct *old_ve; net = container_of(work, struct net, work); @@ -168,11 +179,13 @@ static void cleanup_net(struct work_stru */ synchronize_rcu(); + old_ve = set_exec_env(net->owner_ve); /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) { if (ops->exit) ops->exit(net); } + (void)set_exec_env(old_ve); mutex_unlock(&net_mutex); @@ -259,6 +272,16 @@ static int __init net_ns_init(void) pure_initcall(net_ns_init); #ifdef CONFIG_NET_NS + +#include + +static inline void set_net_context(struct net *net) +{ + set_exec_env(net->owner_ve); + if (net->loopback_dev) + set_exec_ub(netdev_bc(net->loopback_dev)->exec_ub); +} + static int register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { @@ -268,7 +291,9 @@ static int register_pernet_operations(st list_add_tail(&ops->list, list); if (ops->init) { for_each_net(net) { + set_net_context(net); error = ops->init(net); + set_net_context(&init_net); if (error) goto out_undo; } @@ -282,7 +307,10 @@ out_undo: for_each_net(undo_net) { if (undo_net == net) goto undone; + + set_net_context(undo_net); ops->exit(undo_net); + set_net_context(&init_net); } } undone: @@ -295,8 +323,11 @@ static void unregister_pernet_operations list_del(&ops->list); if (ops->exit) - for_each_net(net) + for_each_net(net) { + set_net_context(net); ops->exit(net); + set_net_context(&init_net); + } } #else diff -urNp linux-2.6.32.48/net/core/net-sysfs.c linux-2.6.32.48-openvz/net/core/net-sysfs.c --- linux-2.6.32.48/net/core/net-sysfs.c 2011-11-08 19:02:43.000000000 -0500 +++ 
linux-2.6.32.48-openvz/net/core/net-sysfs.c 2011-11-21 17:40:47.000000000 -0500 @@ -268,6 +268,27 @@ static struct device_attribute net_class {} }; +#ifdef CONFIG_VE +struct device_attribute ve_net_class_attributes[] = { + __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), + __ATTR(iflink, S_IRUGO, show_iflink, NULL), + __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), + __ATTR(features, S_IRUGO, show_features, NULL), + __ATTR(type, S_IRUGO, show_type, NULL), + __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), + __ATTR(address, S_IRUGO, show_address, NULL), + __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), + __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(dormant, S_IRUGO, show_dormant, NULL), + __ATTR(operstate, S_IRUGO, show_operstate, NULL), + __ATTR(mtu, S_IRUGO, show_mtu, NULL), + __ATTR(flags, S_IRUGO, show_flags, NULL), + __ATTR(tx_queue_len, S_IRUGO, show_tx_queue_len, NULL), + {} +}; +EXPORT_SYMBOL(ve_net_class_attributes); +#endif + /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, struct device_attribute *attr, char *buf, @@ -430,9 +451,6 @@ static int netdev_uevent(struct device * struct net_device *dev = to_net_dev(d); int retval; - if (!net_eq(dev_net(dev), &init_net)) - return 0; - /* pass interface to uevent. */ retval = add_uevent_var(env, "INTERFACE=%s", dev->name); if (retval) @@ -462,7 +480,7 @@ static void netdev_release(struct device kfree((char *)dev - dev->padded); } -static struct class net_class = { +struct class net_class = { .name = "net", .dev_release = netdev_release, #ifdef CONFIG_SYSFS @@ -472,6 +490,13 @@ static struct class net_class = { .dev_uevent = netdev_uevent, #endif }; +EXPORT_SYMBOL(net_class); + +#ifndef CONFIG_VE +#define visible_net_class net_class +#else +#define visible_net_class (*get_exec_env()->net_class) +#endif /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. 
@@ -482,9 +507,6 @@ void netdev_unregister_kobject(struct ne kobject_get(&dev->kobj); - if (dev_net(net) != &init_net) - return; - device_del(dev); } @@ -494,7 +516,7 @@ int netdev_register_kobject(struct net_d struct device *dev = &(net->dev); const struct attribute_group **groups = net->sysfs_groups; - dev->class = &net_class; + dev->class = &visible_net_class; dev->platform_data = net; dev->groups = groups; @@ -509,9 +531,6 @@ int netdev_register_kobject(struct net_d #endif #endif /* CONFIG_SYSFS */ - if (dev_net(net) != &init_net) - return 0; - return device_add(dev); } @@ -534,7 +553,15 @@ void netdev_initialize_kobject(struct ne device_initialize(device); } +void prepare_sysfs_netdev(void) +{ +#ifdef CONFIG_VE + get_ve0()->net_class = &net_class; +#endif +} + int netdev_kobject_init(void) { + prepare_sysfs_netdev(); return class_register(&net_class); } diff -urNp linux-2.6.32.48/net/core/rtnetlink.c linux-2.6.32.48-openvz/net/core/rtnetlink.c --- linux-2.6.32.48/net/core/rtnetlink.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/rtnetlink.c 2011-11-21 17:40:47.000000000 -0500 @@ -1206,6 +1206,8 @@ static int rtnl_dump_all(struct sk_buff if (rtnl_msg_handlers[idx] == NULL || rtnl_msg_handlers[idx][type].dumpit == NULL) continue; + if (vz_security_family_check(idx)) + continue; if (idx > s_idx) memset(&cb->args[0], 0, sizeof(cb->args)); if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) @@ -1267,13 +1269,13 @@ static int rtnetlink_rcv_msg(struct sk_b return 0; family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; - if (family >= NPROTO) + if (family >= NPROTO || vz_security_family_check(family)) return -EAFNOSUPPORT; sz_idx = type>>2; kind = type&3; - if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) + if (kind != 2 && security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { diff -urNp linux-2.6.32.48/net/core/scm.c linux-2.6.32.48-openvz/net/core/scm.c --- 
linux-2.6.32.48/net/core/scm.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/scm.c 2011-11-21 18:22:00.000000000 -0500 @@ -36,6 +36,7 @@ #include #include +#include /* * Only allow a user to send credentials, that they could set with @@ -46,7 +47,9 @@ static __inline__ int scm_check_creds(st { const struct cred *cred = current_cred(); - if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && + if ((creds->pid == task_tgid_vnr(current) || + creds->pid == current->tgid || + capable(CAP_VE_SYS_ADMIN)) && ((creds->uid == cred->uid || creds->uid == cred->euid || creds->uid == cred->suid) || capable(CAP_SETUID)) && ((creds->gid == cred->gid || creds->gid == cred->egid || @@ -73,7 +76,7 @@ static int scm_fp_copy(struct cmsghdr *c if (!fpl) { - fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_UBC); if (!fpl) return -ENOMEM; *fplp = fpl; @@ -304,7 +307,7 @@ struct scm_fp_list *scm_fp_dup(struct sc return NULL; new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), - GFP_KERNEL); + GFP_KERNEL_UBC); if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); diff -urNp linux-2.6.32.48/net/core/skbuff.c linux-2.6.32.48-openvz/net/core/skbuff.c --- linux-2.6.32.48/net/core/skbuff.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/skbuff.c 2011-11-21 17:40:47.000000000 -0500 @@ -67,6 +67,7 @@ #include #include #include +#include #include "kmap_skb.h" @@ -184,6 +185,10 @@ struct sk_buff *__alloc_skb(unsigned int if (!skb) goto out; + if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) + goto nobc; + + /* Get the DATA. Size must match skb_add_mtu(). 
*/ size = SKB_DATA_ALIGN(size); data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), gfp_mask, node); @@ -202,6 +207,7 @@ struct sk_buff *__alloc_skb(unsigned int skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; + skb->owner_env = get_exec_env(); kmemcheck_annotate_bitfield(skb, flags1); kmemcheck_annotate_bitfield(skb, flags2); #ifdef NET_SKBUFF_DATA_USES_OFFSET @@ -234,6 +240,8 @@ struct sk_buff *__alloc_skb(unsigned int out: return skb; nodata: + ub_skb_free_bc(skb); +nobc: kmem_cache_free(cache, skb); skb = NULL; goto out; @@ -362,6 +370,7 @@ static void kfree_skbmem(struct sk_buff struct sk_buff *other; atomic_t *fclone_ref; + ub_skb_free_bc(skb); switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb); @@ -394,6 +403,7 @@ static void skb_release_head_state(struc #ifdef CONFIG_XFRM secpath_put(skb->sp); #endif + ub_skb_uncharge(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -560,6 +570,11 @@ static void __copy_skb_header(struct sk_ #endif new->vlan_tci = old->vlan_tci; +#ifdef CONFIG_VE + new->accounted = old->accounted; + new->redirected = old->redirected; +#endif + skb_copy_brmark(new, old); skb_copy_secmark(new, old); } @@ -581,6 +596,10 @@ static struct sk_buff *__skb_clone(struc n->hdr_len = skb->nohdr ? 
skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; + C(owner_env); +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) + C(brmark); +#endif n->destructor = NULL; C(tail); C(end); @@ -589,6 +608,11 @@ static struct sk_buff *__skb_clone(struc C(truesize); atomic_set(&n->users, 1); +#ifdef CONFIG_VE + C(accounted); + C(redirected); +#endif + atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; @@ -647,6 +671,10 @@ struct sk_buff *skb_clone(struct sk_buff n->fclone = SKB_FCLONE_UNAVAILABLE; } + if (ub_skb_alloc_bc(n, gfp_mask)) { + kmem_cache_free(skbuff_head_cache, n); + return NULL; + } return __skb_clone(n, skb); } EXPORT_SYMBOL(skb_clone); diff -urNp linux-2.6.32.48/net/core/sock.c linux-2.6.32.48-openvz/net/core/sock.c --- linux-2.6.32.48/net/core/sock.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/sock.c 2011-11-21 17:40:47.000000000 -0500 @@ -124,6 +124,9 @@ #include #include +#include +#include + #include #ifdef CONFIG_INET @@ -254,7 +257,7 @@ static void sock_warn_obsolete_bsdism(co static char warncomm[TASK_COMM_LEN]; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); - printk(KERN_WARNING "process `%s' is using obsolete " + ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete " "%s SO_BSDCOMPAT\n", warncomm, name); warned++; } @@ -290,7 +293,7 @@ int sock_queue_rcv_skb(struct sock *sk, if (err) goto out; - if (!sk_rmem_schedule(sk, skb->truesize)) { + if (!sk_rmem_schedule(sk, skb)) { err = -ENOBUFS; goto out; } @@ -1006,6 +1009,7 @@ static void sk_prot_free(struct proto *p slab = prot->slab; security_sk_free(sk); + ub_sock_uncharge(sk); if (slab != NULL) kmem_cache_free(slab, sk); else @@ -1034,6 +1038,7 @@ struct sock *sk_alloc(struct net *net, i */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); + sk->owner_env = get_exec_env(); sock_net_set(sk, get_net(net)); atomic_set(&sk->sk_wmem_alloc, 1); } @@ -1146,14 +1151,11 @@ struct sock 
*sk_clone(const struct sock if (filter != NULL) sk_filter_charge(newsk, filter); - if (unlikely(xfrm_sk_clone_policy(newsk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - newsk = NULL; - goto out; - } + if (ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type) < 0) + goto out_err; + + if (unlikely(xfrm_sk_clone_policy(newsk))) + goto out_err; newsk->sk_err = 0; newsk->sk_priority = 0; @@ -1186,13 +1188,22 @@ struct sock *sk_clone(const struct sock sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) net_enable_timestamp(); } -out: return newsk; + +out_err: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + sock_reset_flag(newsk, SOCK_TIMESTAMP); + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; } EXPORT_SYMBOL_GPL(sk_clone); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + extern int sysctl_tcp_use_sg; + __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) @@ -1205,6 +1216,8 @@ void sk_setup_caps(struct sock *sk, stru sk->sk_gso_max_size = dst->dev->gso_max_size; } } + if (!sysctl_tcp_use_sg) + sk->sk_route_caps &= ~NETIF_F_SG; } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -1382,9 +1395,8 @@ static long sock_wait_for_wmem(struct so /* * Generic send/receive buffer handlers */ - -struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, - unsigned long data_len, int noblock, +struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size, + unsigned long size2, int noblock, int *errcode) { struct sk_buff *skb; @@ -1406,46 +1418,35 @@ struct sk_buff *sock_alloc_send_pskb(str if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int npages; - int i; - - /* No pages, we're done... 
*/ - if (!data_len) - break; - - npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - skb_frag_t *frag; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - frag = &skb_shinfo(skb)->frags[i]; - frag->page = page; - frag->page_offset = 0; - frag->size = (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len); - data_len -= PAGE_SIZE; - } + if (ub_sock_getwres_other(sk, skb_charge_size(size))) { + if (size2 < size) { + size = size2; + continue; + } + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = ub_sock_wait_for_space(sk, timeo, + skb_charge_size(size)); + continue; + } + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + skb = alloc_skb(size, gfp_mask); + if (skb) /* Full success... 
*/ break; - } + ub_sock_retwres_other(sk, skb_charge_size(size), + SOCK_MIN_UBCSPACE_CH); err = -ENOBUFS; goto failure; } + ub_sock_retwres_other(sk, + skb_charge_size(size), + SOCK_MIN_UBCSPACE_CH); set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; @@ -1456,6 +1457,7 @@ struct sk_buff *sock_alloc_send_pskb(str timeo = sock_wait_for_wmem(sk, timeo); } + ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF); skb_set_owner_w(skb, sk); return skb; @@ -1465,12 +1467,12 @@ failure: *errcode = err; return NULL; } -EXPORT_SYMBOL(sock_alloc_send_pskb); +EXPORT_SYMBOL(sock_alloc_send_skb2); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_skb2(sk, size, size, noblock, errcode); } EXPORT_SYMBOL(sock_alloc_send_skb); @@ -1904,21 +1906,24 @@ void lock_sock_nested(struct sock *sk, i __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock(&sk->sk_lock.slock); +#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE) /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); +#endif local_bh_enable(); } EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { +#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE) /* * The sk_lock has mutex_unlock() semantics: */ mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - +#endif spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); @@ -2194,7 +2199,7 @@ int proto_register(struct proto *prot, i { if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN | prot->slab_flags, + SLAB_HWCACHE_ALIGN | SLAB_UBC | prot->slab_flags, NULL); if (prot->slab == NULL) { @@ -2213,7 +2218,7 @@ int proto_register(struct proto *prot, i sprintf(prot->rsk_prot->slab_name, mask, 
prot->name); prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, prot->rsk_prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (prot->rsk_prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", @@ -2235,7 +2240,7 @@ int proto_register(struct proto *prot, i kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, 0, - SLAB_HWCACHE_ALIGN | + SLAB_HWCACHE_ALIGN | SLAB_UBC | prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) diff -urNp linux-2.6.32.48/net/core/stream.c linux-2.6.32.48-openvz/net/core/stream.c --- linux-2.6.32.48/net/core/stream.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/core/stream.c 2011-11-21 17:40:47.000000000 -0500 @@ -112,8 +112,10 @@ EXPORT_SYMBOL(sk_stream_wait_close); * sk_stream_wait_memory - Wait for more memory for a socket * @sk: socket to wait for memory * @timeo_p: for how long + * @amount - amount of memory to wait for (in UB space!) 
*/ -int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, + unsigned long amount) { int err = 0; long vm_wait = 0; @@ -135,7 +137,10 @@ int sk_stream_wait_memory(struct sock *s if (signal_pending(current)) goto do_interrupted; clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - if (sk_stream_memory_free(sk) && !vm_wait) + if (amount == 0) { + if (sk_stream_memory_free(sk) && !vm_wait) + break; + } else if (!ub_sock_sndqueueadd_tcp(sk, amount)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -145,6 +150,8 @@ int sk_stream_wait_memory(struct sock *s (sk_stream_memory_free(sk) && !vm_wait)); sk->sk_write_pending--; + if (amount > 0) + ub_sock_sndqueuedel(sk); if (vm_wait) { vm_wait -= current_timeo; @@ -171,6 +178,10 @@ do_interrupted: goto out; } +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ + return __sk_stream_wait_memory(sk, timeo_p, 0); +} EXPORT_SYMBOL(sk_stream_wait_memory); int sk_stream_error(struct sock *sk, int flags, int err) diff -urNp linux-2.6.32.48/net/dccp/ipv6.c linux-2.6.32.48-openvz/net/dccp/ipv6.c --- linux-2.6.32.48/net/dccp/ipv6.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/dccp/ipv6.c 2011-11-21 17:40:47.000000000 -0500 @@ -583,6 +583,8 @@ static struct sock *dccp_v6_request_recv __ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + if (!sysctl_tcp_use_sg) + newsk->sk_route_caps &= ~NETIF_F_SG; newdp6 = (struct dccp6_sock *)newsk; newinet = inet_sk(newsk); newinet->pinet6 = &newdp6->inet6; diff -urNp linux-2.6.32.48/net/dccp/minisocks.c linux-2.6.32.48-openvz/net/dccp/minisocks.c --- linux-2.6.32.48/net/dccp/minisocks.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/dccp/minisocks.c 2011-11-21 17:40:47.000000000 -0500 @@ -19,6 +19,8 @@ #include #include +#include + #include "ackvec.h" #include "ccid.h" #include "dccp.h" @@ -46,7 +48,8 @@ void 
dccp_time_wait(struct sock *sk, int { struct inet_timewait_sock *tw = NULL; - if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) + if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets && + ub_timewait_check(sk, &dccp_death_row)) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { diff -urNp linux-2.6.32.48/net/decnet/netfilter/dn_rtmsg.c linux-2.6.32.48-openvz/net/decnet/netfilter/dn_rtmsg.c --- linux-2.6.32.48/net/decnet/netfilter/dn_rtmsg.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/decnet/netfilter/dn_rtmsg.c 2011-11-21 17:40:47.000000000 -0500 @@ -107,7 +107,7 @@ static inline void dnrmg_receive_user_sk if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); /* Eventually we might send routing messages too */ diff -urNp linux-2.6.32.48/net/ipv4/af_inet.c linux-2.6.32.48-openvz/net/ipv4/af_inet.c --- linux-2.6.32.48/net/ipv4/af_inet.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/af_inet.c 2011-11-21 17:40:47.000000000 -0500 @@ -115,6 +115,7 @@ #ifdef CONFIG_IP_MROUTE #include #endif +#include /* The inetsw table contains everything that inet_create needs to @@ -324,6 +325,10 @@ lookup_protocol: goto out_rcu_unlock; } + err = vz_security_protocol_check(answer->protocol); + if (err < 0) + goto out_rcu_unlock; + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; @@ -345,6 +350,13 @@ lookup_protocol: if (sk == NULL) goto out; + err = -ENOBUFS; + if (ub_sock_charge(sk, PF_INET, sock->type)) + goto out_sk_free; + /* if charge was successful, sock_init_data() MUST be called to + * set sk->sk_type. 
otherwise sk will be uncharged to wrong resource + */ + err = 0; sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) @@ -402,6 +414,9 @@ out: out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_free: + sk_free(sk); + return err; } @@ -416,6 +431,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + struct ve_struct *saved_env; + + saved_env = set_exec_env(sk->owner_env); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -433,6 +451,8 @@ int inet_release(struct socket *sock) timeout = sk->sk_lingertime; sock->sk = NULL; sk->sk_prot->close(sk, timeout); + + (void)set_exec_env(saved_env); } return 0; } diff -urNp linux-2.6.32.48/net/ipv4/arp.c linux-2.6.32.48-openvz/net/ipv4/arp.c --- linux-2.6.32.48/net/ipv4/arp.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/arp.c 2011-11-21 17:40:47.000000000 -0500 @@ -1136,7 +1136,8 @@ int arp_ioctl(struct net *net, unsigned switch (cmd) { case SIOCDARP: case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) return -EPERM; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); diff -urNp linux-2.6.32.48/net/ipv4/devinet.c linux-2.6.32.48-openvz/net/ipv4/devinet.c --- linux-2.6.32.48/net/ipv4/devinet.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/devinet.c 2011-11-21 17:40:47.000000000 -0500 @@ -110,10 +110,11 @@ static inline void devinet_sysctl_unregi /* Locks all the inet devices. 
*/ -static struct in_ifaddr *inet_alloc_ifa(void) +struct in_ifaddr *inet_alloc_ifa(void) { - return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL); + return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_UBC); } +EXPORT_SYMBOL_GPL(inet_alloc_ifa); static void inet_rcu_free_ifa(struct rcu_head *head) { @@ -146,7 +147,7 @@ void in_dev_finish_destroy(struct in_dev } } -static struct in_device *inetdev_init(struct net_device *dev) +struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; @@ -182,6 +183,7 @@ out_kfree: in_dev = NULL; goto out; } +EXPORT_SYMBOL_GPL(inetdev_init); static void in_dev_rcu_put(struct rcu_head *head) { @@ -375,7 +377,7 @@ static int __inet_insert_ifa(struct in_i return 0; } -static int inet_insert_ifa(struct in_ifaddr *ifa) +int inet_insert_ifa(struct in_ifaddr *ifa) { return __inet_insert_ifa(ifa, NULL, 0); } @@ -426,6 +428,7 @@ struct in_ifaddr *inet_ifa_byprefix(stru } endfor_ifa(in_dev); return NULL; } +EXPORT_SYMBOL_GPL(inet_insert_ifa); static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { @@ -624,7 +627,7 @@ int devinet_ioctl(struct net *net, unsig case SIOCSIFFLAGS: ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ @@ -632,7 +635,7 @@ int devinet_ioctl(struct net *net, unsig case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) diff -urNp linux-2.6.32.48/net/ipv4/fib_frontend.c linux-2.6.32.48-openvz/net/ipv4/fib_frontend.c --- linux-2.6.32.48/net/ipv4/fib_frontend.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/fib_frontend.c 2011-11-21 17:40:47.000000000 -0500 @@ -262,7 +262,8 @@ int fib_validate_source(__be32 src, __be net = dev_net(dev); if 
(fib_lookup(net, &fl, &res)) goto last_resort; - if (res.type != RTN_UNICAST) + if (res.type != RTN_UNICAST && + (!(dev->features & NETIF_F_VENET) || res.type != RTN_LOCAL)) goto e_inval_res; *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); @@ -464,7 +465,7 @@ int ip_rt_ioctl(struct net *net, unsigne switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&rt, arg, sizeof(rt))) diff -urNp linux-2.6.32.48/net/ipv4/fib_hash.c linux-2.6.32.48-openvz/net/ipv4/fib_hash.c --- linux-2.6.32.48/net/ipv4/fib_hash.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/fib_hash.c 2011-11-21 17:40:47.000000000 -0500 @@ -769,10 +769,10 @@ static int fn_hash_dump(struct fib_table void __init fib_hash_init(void) { fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_UBC, NULL); fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_UBC, NULL); } diff -urNp linux-2.6.32.48/net/ipv4/inet_connection_sock.c linux-2.6.32.48-openvz/net/ipv4/inet_connection_sock.c --- linux-2.6.32.48/net/ipv4/inet_connection_sock.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/inet_connection_sock.c 2011-11-21 17:40:47.000000000 -0500 @@ -24,6 +24,9 @@ #include #include +#include +#include + #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); @@ -165,6 +168,8 @@ have_snum: goto tb_not_found; tb_found: if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse > 1) + goto success; if (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN && smallest_size == -1) { @@ -618,7 +623,7 @@ void inet_csk_destroy_sock(struct sock * sk_refcnt_debug_release(sk); - percpu_counter_dec(sk->sk_prot->orphan_count); 
+ ub_dec_orphan_count(sk); sock_put(sk); } @@ -698,7 +703,7 @@ void inet_csk_listen_stop(struct sock *s sock_orphan(child); - percpu_counter_inc(sk->sk_prot->orphan_count); + ub_inc_orphan_count(sk); inet_csk_destroy_sock(child); diff -urNp linux-2.6.32.48/net/ipv4/inet_diag.c linux-2.6.32.48-openvz/net/ipv4/inet_diag.c --- linux-2.6.32.48/net/ipv4/inet_diag.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/inet_diag.c 2011-11-21 17:40:47.000000000 -0500 @@ -705,6 +705,7 @@ static int inet_diag_dump(struct sk_buff struct inet_diag_req *r = NLMSG_DATA(cb->nlh); const struct inet_diag_handler *handler; struct inet_hashinfo *hashinfo; + struct ve_struct *ve = get_exec_env(); handler = inet_diag_lock_handler(cb->nlh->nlmsg_type); if (IS_ERR(handler)) @@ -730,6 +731,8 @@ static int inet_diag_dump(struct sk_buff sk_nulls_for_each(sk, node, &ilb->head) { struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible(sk->owner_env, ve)) + continue; if (num < s_num) { num++; continue; @@ -796,6 +799,8 @@ skip_listen_ht: sk_nulls_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible(sk->owner_env, ve)) + continue; if (num < s_num) goto next_normal; if (!(r->idiag_states & (1 << sk->sk_state))) @@ -820,6 +825,8 @@ next_normal: inet_twsk_for_each(tw, node, &head->twchain) { + if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) + continue; if (num < s_num) goto next_dying; if (r->id.idiag_sport != tw->tw_sport && diff -urNp linux-2.6.32.48/net/ipv4/inet_fragment.c linux-2.6.32.48-openvz/net/ipv4/inet_fragment.c --- linux-2.6.32.48/net/ipv4/inet_fragment.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/inet_fragment.c 2011-11-21 17:40:47.000000000 -0500 @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -249,6 +250,9 @@ static struct inet_frag_queue *inet_frag spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); q->net = nf; +#ifdef CONFIG_VE + q->owner_ve = get_exec_env(); 
+#endif return q; } diff -urNp linux-2.6.32.48/net/ipv4/inet_timewait_sock.c linux-2.6.32.48-openvz/net/ipv4/inet_timewait_sock.c --- linux-2.6.32.48/net/ipv4/inet_timewait_sock.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/inet_timewait_sock.c 2011-11-21 17:40:47.000000000 -0500 @@ -14,6 +14,8 @@ #include #include +#include + /* Must be called with locally disabled BHs. */ static void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) @@ -115,9 +117,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance) struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { - struct inet_timewait_sock *tw = - kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, - GFP_ATOMIC); + struct user_beancounter *ub; + struct inet_timewait_sock *tw; + + ub = set_exec_ub(sock_bc(sk)->ub); + tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + GFP_ATOMIC); + (void)set_exec_ub(ub); + if (tw != NULL) { const struct inet_sock *inet = inet_sk(sk); @@ -169,6 +176,7 @@ static int inet_twdr_do_twkill_work(stru rescan: inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { __inet_twsk_del_dead_node(tw); + ub_timewait_dec(tw, twdr); spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); #ifdef CONFIG_NET_NS @@ -269,6 +277,7 @@ void inet_twsk_deschedule(struct inet_ti { spin_lock(&twdr->death_lock); if (inet_twsk_del_dead_node(tw)) { + ub_timewait_dec(tw, twdr); inet_twsk_put(tw); if (--twdr->tw_count == 0) del_timer(&twdr->tw_timer); @@ -315,9 +324,10 @@ void inet_twsk_schedule(struct inet_time spin_lock(&twdr->death_lock); /* Unlink it, if it was scheduled */ - if (inet_twsk_del_dead_node(tw)) + if (inet_twsk_del_dead_node(tw)) { + ub_timewait_dec(tw, twdr); twdr->tw_count--; - else + } else atomic_inc(&tw->tw_refcnt); if (slot >= INET_TWDR_RECYCLE_SLOTS) { @@ -353,6 +363,7 @@ void inet_twsk_schedule(struct inet_time hlist_add_head(&tw->tw_death_node, list); + ub_timewait_inc(tw, 
twdr); if (twdr->tw_count++ == 0) mod_timer(&twdr->tw_timer, jiffies + twdr->period); spin_unlock(&twdr->death_lock); @@ -387,6 +398,7 @@ void inet_twdr_twcal_tick(unsigned long &twdr->twcal_row[slot]) { __inet_twsk_del_dead_node(tw); __inet_twsk_kill(tw, twdr->hashinfo); + ub_timewait_dec(tw, twdr); #ifdef CONFIG_NET_NS NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); #endif diff -urNp linux-2.6.32.48/net/ipv4/ipconfig.c linux-2.6.32.48-openvz/net/ipv4/ipconfig.c --- linux-2.6.32.48/net/ipv4/ipconfig.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/ipconfig.c 2011-11-21 17:40:47.000000000 -0500 @@ -192,19 +192,20 @@ static int __init ic_open_devs(void) struct ic_device *d, **last; struct net_device *dev; unsigned short oflags; + struct net *net = get_exec_env()->ve_netns; last = &ic_first_dev; rtnl_lock(); /* bring loopback device up first */ - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { if (!(dev->flags & IFF_LOOPBACK)) continue; if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); } - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { if (dev->flags & IFF_LOOPBACK) continue; if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : @@ -459,9 +460,6 @@ ic_rarp_recv(struct sk_buff *skb, struct unsigned char *sha, *tha; /* s for "source", t for "target" */ struct ic_device *d; - if (!net_eq(dev_net(dev), &init_net)) - goto drop; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; @@ -885,9 +883,6 @@ static int __init ic_bootp_recv(struct s struct ic_device *d; int len, ext_len; - if (!net_eq(dev_net(dev), &init_net)) - goto drop; - /* Perform verifications before taking the lock. 
*/ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; diff -urNp linux-2.6.32.48/net/ipv4/ip_forward.c linux-2.6.32.48-openvz/net/ipv4/ip_forward.c --- linux-2.6.32.48/net/ipv4/ip_forward.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/ip_forward.c 2011-11-21 17:40:47.000000000 -0500 @@ -94,6 +94,24 @@ int ip_forward(struct sk_buff *skb) goto drop; } + /* + * We try to optimize forwarding of VE packets: + * do not decrement TTL (and so save skb_cow) + * during forwarding of outgoing pkts from VE. + * For incoming pkts we still do ttl decr, + * since such skb is not cloned and does not require + * actual cow. So, there is at least one place + * in pkts path with mandatory ttl decr, that is + * sufficient to prevent routing loops. + */ + iph = ip_hdr(skb); + if ( +#ifdef CONFIG_IP_ROUTE_NAT + (rt->rt_flags & RTCF_NAT) == 0 && /* no NAT mangling expected */ +#endif /* and */ + (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */ + goto no_ttl_decr; + /* We are about to mangle packet. Copy it! */ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) goto drop; @@ -102,6 +120,8 @@ int ip_forward(struct sk_buff *skb) /* Decrease ttl after skb cow done */ ip_decrease_ttl(iph); +no_ttl_decr: + /* * We now generate an ICMP HOST REDIRECT giving the route * we calculated. 
diff -urNp linux-2.6.32.48/net/ipv4/ip_fragment.c linux-2.6.32.48-openvz/net/ipv4/ip_fragment.c --- linux-2.6.32.48/net/ipv4/ip_fragment.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/ip_fragment.c 2011-11-21 17:40:47.000000000 -0500 @@ -186,10 +186,13 @@ static void ip_evictor(struct net *net) */ static void ip_expire(unsigned long arg) { + struct inet_frag_queue *q = (struct inet_frag_queue *)arg; struct ipq *qp; struct net *net; + struct ve_struct *old_ve; - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + qp = container_of(q, struct ipq, q); + old_ve = set_exec_env(q->owner_ve); net = container_of(qp->q.net, struct net, ipv4.frags); spin_lock(&qp->q.lock); @@ -214,6 +217,8 @@ static void ip_expire(unsigned long arg) out: spin_unlock(&qp->q.lock); ipq_put(qp); + + (void)set_exec_env(old_ve); } /* Find the correct entry in the "incomplete datagrams" queue for @@ -525,6 +530,7 @@ static int ip_frag_reasm(struct ipq *qp, clone->csum = 0; clone->ip_summed = head->ip_summed; atomic_add(clone->truesize, &qp->q.net->mem); + clone->owner_env = head->owner_env; } skb_shinfo(head)->frag_list = head->next; diff -urNp linux-2.6.32.48/net/ipv4/ip_gre.c linux-2.6.32.48-openvz/net/ipv4/ip_gre.c --- linux-2.6.32.48/net/ipv4/ip_gre.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/ip_gre.c 2011-11-21 17:40:47.000000000 -0500 @@ -50,6 +50,9 @@ #include #endif +#include +#include + /* Problems & solutions -------------------- @@ -1202,6 +1205,8 @@ static int ipgre_close(struct net_device #endif +static void ipgre_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context *ctx); static const struct net_device_ops ipgre_netdev_ops = { .ndo_init = ipgre_tunnel_init, .ndo_uninit = ipgre_tunnel_uninit, @@ -1212,6 +1217,7 @@ static const struct net_device_ops ipgre .ndo_start_xmit = ipgre_tunnel_xmit, .ndo_do_ioctl = ipgre_tunnel_ioctl, .ndo_change_mtu = ipgre_tunnel_change_mtu, + .ndo_cpt = ipgre_cpt, }; 
static void ipgre_tunnel_setup(struct net_device *dev) @@ -1297,6 +1303,112 @@ static void ipgre_destroy_tunnels(struct } } +static void ipgre_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context *ctx) +{ + struct cpt_tunnel_image v; + struct ip_tunnel *t; + struct ipgre_net *ign; + + t = netdev_priv(dev); + ign = net_generic(get_exec_env()->ve_netns, ipgre_net_id); + BUG_ON(ign == NULL); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + /* mark fb dev */ + v.cpt_tnl_flags = CPT_TUNNEL_GRE; + if (dev == ign->fb_tunnel_dev) + v.cpt_tnl_flags |= CPT_TUNNEL_FBDEV; + + v.cpt_i_flags = t->parms.i_flags; + v.cpt_o_flags = t->parms.o_flags; + v.cpt_i_key = t->parms.i_key; + v.cpt_o_key = t->parms.o_key; + v.cpt_i_seqno = t->i_seqno; + v.cpt_o_seqno = t->o_seqno; + + BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph)); + memcpy(&v.cpt_iphdr, &t->parms.iph, sizeof(t->parms.iph)); + + ops->write(&v, sizeof(v), ctx); +} + +static int ipgre_rst(loff_t start, struct cpt_netdev_image *di, + struct rst_ops *ops, struct cpt_context *ctx) +{ + int err = -ENODEV; + struct cpt_tunnel_image v; + struct net_device *dev; + struct ip_tunnel *t; + loff_t pos; + int fbdev; + struct ipgre_net *ign; + + ign = net_generic(get_exec_env()->ve_netns, ipgre_net_id); + if (ign == NULL) + return -EOPNOTSUPP; + + pos = start + di->cpt_hdrlen; + err = ops->get_object(CPT_OBJ_NET_IPIP_TUNNEL, + pos, &v, sizeof(v), ctx); + if (err) + return err; + + /* some sanity */ + if (v.cpt_content != CPT_CONTENT_VOID) + return -EINVAL; + + if (!(v.cpt_tnl_flags & CPT_TUNNEL_GRE)) + return 1; + + if (v.cpt_tnl_flags & CPT_TUNNEL_FBDEV) { + fbdev = 1; + err = 0; + dev = ign->fb_tunnel_dev; + } else { + fbdev = 0; + err = -ENOMEM; + dev = alloc_netdev(sizeof(struct ip_tunnel), di->cpt_name, + ipgre_tunnel_setup); + if (!dev) + goto out; + } + + t = netdev_priv(dev); + t->parms.i_flags = v.cpt_i_flags; + 
t->parms.o_flags = v.cpt_o_flags; + t->parms.i_key = v.cpt_i_key; + t->parms.o_key = v.cpt_o_key; + t->i_seqno = v.cpt_i_seqno; + t->o_seqno = v.cpt_o_seqno; + + BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph)); + memcpy(&t->parms.iph, &v.cpt_iphdr, sizeof(t->parms.iph)); + + if (!fbdev) { + ipgre_tunnel_init(dev); + err = register_netdevice(dev); + if (err) { + free_netdev(dev); + goto out; + } + + dev_hold(dev); + ipgre_tunnel_link(ign, t); + } +out: + return err; +} + +static struct netdev_rst ipgre_netdev_rst = { + .cpt_object = CPT_OBJ_NET_IPIP_TUNNEL, + .ndo_rst = ipgre_rst, +}; + static int ipgre_init_net(struct net *net) { int err; @@ -1683,6 +1795,7 @@ static int __init ipgre_init(void) if (err < 0) goto tap_ops_failed; + register_netdev_rst(&ipgre_netdev_rst); out: return err; @@ -1697,6 +1810,7 @@ add_proto_failed: static void __exit ipgre_fini(void) { + unregister_netdev_rst(&ipgre_netdev_rst); rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) diff -urNp linux-2.6.32.48/net/ipv4/ip_input.c linux-2.6.32.48-openvz/net/ipv4/ip_input.c --- linux-2.6.32.48/net/ipv4/ip_input.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/ip_input.c 2011-11-21 17:40:47.000000000 -0500 @@ -193,6 +193,8 @@ static int ip_local_deliver_finish(struc { struct net *net = dev_net(skb->dev); + if (skb->destructor) + skb_orphan(skb); __skb_pull(skb, ip_hdrlen(skb)); /* Point into the IP datagram, just past the header. 
*/ diff -urNp linux-2.6.32.48/net/ipv4/ipip.c linux-2.6.32.48-openvz/net/ipv4/ipip.c --- linux-2.6.32.48/net/ipv4/ipip.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/ipip.c 2011-11-21 18:17:58.000000000 -0500 @@ -106,6 +106,7 @@ #include #include #include +#include #include #include @@ -116,6 +117,9 @@ #include #include +#include +#include + #define HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) @@ -144,6 +148,9 @@ static struct ip_tunnel * ipip_tunnel_lo struct ip_tunnel *t; struct ipip_net *ipn = net_generic(net, ipip_net_id); + if (ipn == NULL) + return NULL; + for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) @@ -686,11 +693,14 @@ static int ipip_tunnel_change_mtu(struct return 0; } +static void ipip_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context *ctx); static const struct net_device_ops ipip_netdev_ops = { .ndo_uninit = ipip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ipip_tunnel_ioctl, .ndo_change_mtu = ipip_tunnel_change_mtu, + .ndo_cpt = ipip_cpt, }; @@ -762,11 +772,116 @@ static void ipip_destroy_tunnels(struct } } +static void ipip_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context *ctx) +{ + struct cpt_tunnel_image v; + struct ip_tunnel *t; + struct ipip_net *ipn; + + t = netdev_priv(dev); + ipn = net_generic(get_exec_env()->ve_netns, ipip_net_id); + BUG_ON(ipn == NULL); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + /* mark fb dev */ + v.cpt_tnl_flags = 0; + if (dev == ipn->fb_tunnel_dev) + v.cpt_tnl_flags |= CPT_TUNNEL_FBDEV; + + v.cpt_i_flags = t->parms.i_flags; + v.cpt_o_flags = t->parms.o_flags; + v.cpt_i_key = t->parms.i_key; + v.cpt_o_key = t->parms.o_key; + + BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph)); + memcpy(&v.cpt_iphdr, 
&t->parms.iph, sizeof(t->parms.iph)); + + ops->write(&v, sizeof(v), ctx); +} + +static int ipip_rst(loff_t start, struct cpt_netdev_image *di, + struct rst_ops *ops, struct cpt_context *ctx) +{ + int err = -ENODEV; + struct cpt_tunnel_image v; + struct net_device *dev; + struct ip_tunnel *t; + loff_t pos; + int fbdev; + struct ipip_net *ipn; + + ipn = net_generic(get_exec_env()->ve_netns, ipip_net_id); + if (ipn == NULL) + return -EOPNOTSUPP; + + pos = start + di->cpt_hdrlen; + err = ops->get_object(CPT_OBJ_NET_IPIP_TUNNEL, + pos, &v, sizeof(v), ctx); + if (err) + return err; + + /* some sanity */ + if (v.cpt_content != CPT_CONTENT_VOID) + return -EINVAL; + + if (v.cpt_tnl_flags & (~CPT_TUNNEL_FBDEV)) + return 1; + + if (v.cpt_tnl_flags & CPT_TUNNEL_FBDEV) { + fbdev = 1; + err = 0; + dev = ipn->fb_tunnel_dev; + } else { + fbdev = 0; + err = -ENOMEM; + dev = alloc_netdev(sizeof(struct ip_tunnel), di->cpt_name, + ipip_tunnel_setup); + if (!dev) + goto out; + } + + t = netdev_priv(dev); + t->parms.i_flags = v.cpt_i_flags; + t->parms.o_flags = v.cpt_o_flags; + t->parms.i_key = v.cpt_i_key; + t->parms.o_key = v.cpt_o_key; + + BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph)); + memcpy(&t->parms.iph, &v.cpt_iphdr, sizeof(t->parms.iph)); + + if (!fbdev) { + ipip_tunnel_init(dev); + err = register_netdevice(dev); + if (err) { + free_netdev(dev); + goto out; + } + + dev_hold(dev); + ipip_tunnel_link(ipn, t); + } +out: + return err; +} + +static struct netdev_rst ipip_netdev_rst = { + .cpt_object = CPT_OBJ_NET_IPIP_TUNNEL, + .ndo_rst = ipip_rst, +}; + static int ipip_init_net(struct net *net) { int err; struct ipip_net *ipn; + if (!(get_exec_env()->features & VE_FEATURE_IPIP)) + return 0; + err = -ENOMEM; ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); if (ipn == NULL) @@ -812,6 +927,9 @@ static void ipip_exit_net(struct net *ne struct ipip_net *ipn; ipn = net_generic(net, ipip_net_id); + if (ipn == NULL) /* no VE_FEATURE_IPIP */ + return; + rtnl_lock(); 
ipip_destroy_tunnels(ipn); unregister_netdevice(ipn->fb_tunnel_dev); @@ -838,11 +956,18 @@ static int __init ipip_init(void) unregister_pernet_device(&ipip_net_ops); printk(KERN_INFO "ipip init: can't register tunnel\n"); } + err = register_netdev_rst(&ipip_netdev_rst); + if (err < 0) { + xfrm_tunnel_deregister(&ipip_handler, AF_INET); + unregister_pernet_device(&ipip_net_ops); + } + return err; } static void __exit ipip_fini(void) { + unregister_netdev_rst(&ipip_netdev_rst); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) printk(KERN_INFO "ipip close: can't deregister tunnel\n"); diff -urNp linux-2.6.32.48/net/ipv4/ip_output.c linux-2.6.32.48-openvz/net/ipv4/ip_output.c --- linux-2.6.32.48/net/ipv4/ip_output.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/ip_output.c 2011-11-21 17:40:47.000000000 -0500 @@ -1372,12 +1372,13 @@ void ip_send_reply(struct sock *sk, stru char data[40]; } replyopts; struct ipcm_cookie ipc; - __be32 daddr; + __be32 saddr, daddr; struct rtable *rt = skb_rtable(skb); if (ip_options_echo(&replyopts.opt, skb)) return; + saddr = ip_hdr(skb)->daddr; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; ipc.shtx.flags = 0; @@ -1393,7 +1394,7 @@ void ip_send_reply(struct sock *sk, stru struct flowi fl = { .oif = arg->bound_dev_if, .nl_u = { .ip4_u = { .daddr = daddr, - .saddr = rt->rt_spec_dst, + .saddr = saddr, .tos = RT_TOS(ip_hdr(skb)->tos) } }, /* Not quite clean, but right. 
*/ .uli_u = { .ports = diff -urNp linux-2.6.32.48/net/ipv4/ip_sockglue.c linux-2.6.32.48-openvz/net/ipv4/ip_sockglue.c --- linux-2.6.32.48/net/ipv4/ip_sockglue.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/ip_sockglue.c 2011-11-21 17:40:47.000000000 -0500 @@ -921,7 +921,7 @@ mc_msf_out: case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) break; err = xfrm_user_policy(sk, optname, optval, optlen); break; diff -urNp linux-2.6.32.48/net/ipv4/netfilter/ip_queue.c linux-2.6.32.48-openvz/net/ipv4/netfilter/ip_queue.c --- linux-2.6.32.48/net/ipv4/netfilter/ip_queue.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/ip_queue.c 2011-11-21 17:40:47.000000000 -0500 @@ -437,7 +437,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); write_lock_bh(&queue_lock); @@ -467,8 +467,12 @@ __ipq_rcv_skb(struct sk_buff *skb) static void ipq_rcv_skb(struct sk_buff *skb) { + struct ve_struct *old_ve; + mutex_lock(&ipqnl_mutex); + old_ve = set_exec_env(skb->owner_env); __ipq_rcv_skb(skb); + (void)set_exec_env(old_ve); mutex_unlock(&ipqnl_mutex); } @@ -478,9 +482,6 @@ ipq_rcv_dev_event(struct notifier_block { struct net_device *dev = ptr; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) ipq_dev_drop(dev->ifindex); @@ -500,7 +501,7 @@ ipq_rcv_nl_event(struct notifier_block * if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL && n->pid) { write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) + if (n->pid == peer_pid) __ipq_reset(); write_unlock_bh(&queue_lock); } diff -urNp linux-2.6.32.48/net/ipv4/netfilter/iptable_filter.c 
linux-2.6.32.48-openvz/net/ipv4/netfilter/iptable_filter.c --- linux-2.6.32.48/net/ipv4/netfilter/iptable_filter.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/iptable_filter.c 2011-11-21 17:40:47.000000000 -0500 @@ -128,16 +128,24 @@ module_param(forward, bool, 0000); static int __net_init iptable_filter_net_init(struct net *net) { + if (!net_ipt_permitted(net, VE_IP_FILTER)) + return 0; + /* Register table */ net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter, &initial_table.repl); if (IS_ERR(net->ipv4.iptable_filter)) return PTR_ERR(net->ipv4.iptable_filter); + + net_ipt_module_set(net, VE_IP_FILTER); return 0; } static void __net_exit iptable_filter_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_FILTER)) + return; + ipt_unregister_table(net->ipv4.iptable_filter); } diff -urNp linux-2.6.32.48/net/ipv4/netfilter/iptable_mangle.c linux-2.6.32.48-openvz/net/ipv4/netfilter/iptable_mangle.c --- linux-2.6.32.48/net/ipv4/netfilter/iptable_mangle.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/iptable_mangle.c 2011-11-21 17:40:47.000000000 -0500 @@ -198,16 +198,24 @@ static struct nf_hook_ops ipt_ops[] __re static int __net_init iptable_mangle_net_init(struct net *net) { + if (!net_ipt_permitted(net, VE_IP_MANGLE)) + return 0; + /* Register table */ net->ipv4.iptable_mangle = ipt_register_table(net, &packet_mangler, &initial_table.repl); if (IS_ERR(net->ipv4.iptable_mangle)) return PTR_ERR(net->ipv4.iptable_mangle); + + net_ipt_module_set(net, VE_IP_MANGLE); return 0; } static void __net_exit iptable_mangle_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_MANGLE)) + return; + ipt_unregister_table(net->ipv4.iptable_mangle); } diff -urNp linux-2.6.32.48/net/ipv4/netfilter/ip_tables.c linux-2.6.32.48-openvz/net/ipv4/netfilter/ip_tables.c --- linux-2.6.32.48/net/ipv4/netfilter/ip_tables.c 2011-11-08 19:02:43.000000000 -0500 +++ 
linux-2.6.32.48-openvz/net/ipv4/netfilter/ip_tables.c 2011-11-21 17:40:47.000000000 -0500 @@ -321,6 +321,9 @@ ipt_do_table(struct sk_buff *skb, struct xt_match_param mtpar; struct xt_target_param tgpar; + if (ve_xt_table_forbidden(table)) + return NF_ACCEPT; + /* Initialization */ ip = ip_hdr(skb); indev = in ? in->name : nulldevname; @@ -466,8 +469,8 @@ mark_source_chains(struct xt_table_info int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - printk("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + ve_printk(VE_LOG, "iptables: loop hook %u pos " + "%u %08X.\n", hook, pos, e->comefrom); return 0; } e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); @@ -950,7 +953,7 @@ static struct xt_counters * alloc_counte (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = ub_vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1217,7 +1220,7 @@ __do_replace(struct net *net, const char void *loc_cpu_old_entry; ret = 0; - counters = vmalloc(num_counters * sizeof(struct xt_counters)); + counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; @@ -1382,7 +1385,7 @@ do_add_counters(struct net *net, void __ if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = ub_vmalloc_node(len - size, numa_node_id()); if (!paddc) return -ENOMEM; @@ -1857,13 +1860,15 @@ compat_do_replace(struct net *net, void return ret; } +static int do_ipt_set_ctl(struct sock *, int, void __user *, unsigned int); + static int compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) 
{ @@ -1876,8 +1881,7 @@ compat_do_ipt_set_ctl(struct sock *sk, i break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); - ret = -EINVAL; + ret = do_ipt_set_ctl(sk, cmd, user, len); } return ret; @@ -1974,7 +1978,7 @@ compat_do_ipt_get_ctl(struct sock *sk, i { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1996,7 +2000,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2021,7 +2025,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2075,7 +2079,7 @@ struct xt_table *ipt_register_table(stru int ret; struct xt_table_info *newinfo; struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; struct xt_table *new_table; @@ -2221,11 +2225,22 @@ static struct xt_match icmp_matchstruct static int __net_init ip_tables_net_init(struct net *net) { - return xt_proto_init(net, NFPROTO_IPV4); + int res; + + if (!net_ipt_permitted(net, VE_IP_IPTABLES)) + return 0; + + res = xt_proto_init(net, NFPROTO_IPV4); + if (!res) + net_ipt_module_set(net, VE_IP_IPTABLES); + return res; } static void __net_exit ip_tables_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_IPTABLES)) + return; + xt_proto_fini(net, NFPROTO_IPV4); } diff -urNp linux-2.6.32.48/net/ipv4/netfilter/ipt_CLUSTERIP.c linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_CLUSTERIP.c --- linux-2.6.32.48/net/ipv4/netfilter/ipt_CLUSTERIP.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_CLUSTERIP.c 2011-11-21 17:40:47.000000000 -0500 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -383,7 +384,8 @@ static bool 
clusterip_tg_check(const str return false; } - dev = dev_get_by_name(&init_net, e->ip.iniface); + dev = dev_get_by_name(get_exec_env()->ve_netns, + e->ip.iniface); if (!dev) { printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); return false; diff -urNp linux-2.6.32.48/net/ipv4/netfilter/ipt_LOG.c linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_LOG.c --- linux-2.6.32.48/net/ipv4/netfilter/ipt_LOG.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_LOG.c 2011-11-21 17:40:47.000000000 -0500 @@ -47,32 +47,32 @@ static void dump_packet(const struct nf_ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Important fields: * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - printk("SRC=%pI4 DST=%pI4 ", + ve_printk(VE_LOG, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ve_printk(VE_LOG, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ if (ntohs(ih->frag_off) & IP_CE) - printk("CE "); + ve_printk(VE_LOG, "CE "); if (ntohs(ih->frag_off) & IP_DF) - printk("DF "); + ve_printk(VE_LOG, "DF "); if (ntohs(ih->frag_off) & IP_MF) - printk("MF "); + ve_printk(VE_LOG, "MF "); /* Max length: 11 "FRAG:65535 " */ if (ntohs(ih->frag_off) & IP_OFFSET) - printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + ve_printk(VE_LOG, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { @@ -84,15 +84,15 @@ static void dump_packet(const struct nf_ op = skb_header_pointer(skb, iphoff+sizeof(_iph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + 
ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + ve_printk(VE_LOG, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + ve_printk(VE_LOG, "%02X", op[i]); + ve_printk(VE_LOG, ") "); } switch (ih->protocol) { @@ -101,7 +101,7 @@ static void dump_packet(const struct nf_ const struct tcphdr *th; /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); + ve_printk(VE_LOG, "PROTO=TCP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -110,41 +110,41 @@ static void dump_packet(const struct nf_ th = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_tcph), &_tcph); if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (logflags & IPT_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", + ve_printk(VE_LOG, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(th->window)); + ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3F " */ - printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + ve_printk(VE_LOG, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ if (th->cwr) - printk("CWR "); + ve_printk(VE_LOG, "CWR "); if (th->ece) - printk("ECE "); + ve_printk(VE_LOG, "ECE "); if (th->urg) - printk("URG "); + ve_printk(VE_LOG, "URG "); if (th->ack) - printk("ACK "); + ve_printk(VE_LOG, "ACK "); if (th->psh) - printk("PSH "); + ve_printk(VE_LOG, "PSH "); if (th->rst) - printk("RST "); + ve_printk(VE_LOG, "RST "); if (th->syn) - printk("SYN "); + ve_printk(VE_LOG, "SYN "); if (th->fin) - printk("FIN "); + 
ve_printk(VE_LOG, "FIN "); /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); + ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { @@ -157,15 +157,15 @@ static void dump_packet(const struct nf_ iphoff+ih->ihl*4+sizeof(_tcph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + ve_printk(VE_LOG, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + ve_printk(VE_LOG, "%02X", op[i]); + ve_printk(VE_LOG, ") "); } break; } @@ -176,9 +176,9 @@ static void dump_packet(const struct nf_ if (ih->protocol == IPPROTO_UDP) /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP " ); + ve_printk(VE_LOG, "PROTO=UDP " ); else /* Max length: 14 "PROTO=UDPLITE " */ - printk("PROTO=UDPLITE "); + ve_printk(VE_LOG, "PROTO=UDPLITE "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -187,13 +187,13 @@ static void dump_packet(const struct nf_ uh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_udph), &_udph); if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); break; @@ -220,7 +220,7 @@ static void dump_packet(const struct nf_ [ICMP_ADDRESSREPLY] = 12 }; /* Max length: 11 "PROTO=ICMP " */ - printk("PROTO=ICMP "); + ve_printk(VE_LOG, "PROTO=ICMP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -229,19 +229,19 @@ static void dump_packet(const struct nf_ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_icmph), &_icmph); if (ich == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 18 
"TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ich->type, ich->code); + ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ if (ich->type <= NR_ICMP_TYPES && required_len[ich->type] && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } @@ -250,35 +250,35 @@ static void dump_packet(const struct nf_ case ICMP_ECHOREPLY: case ICMP_ECHO: /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", + ve_printk(VE_LOG, "ID=%u SEQ=%u ", ntohs(ich->un.echo.id), ntohs(ich->un.echo.sequence)); break; case ICMP_PARAMETERPROB: /* Max length: 14 "PARAMETER=255 " */ - printk("PARAMETER=%u ", + ve_printk(VE_LOG, "PARAMETER=%u ", ntohl(ich->un.gateway) >> 24); break; case ICMP_REDIRECT: /* Max length: 24 "GATEWAY=255.255.255.255 " */ - printk("GATEWAY=%pI4 ", &ich->un.gateway); + ve_printk(VE_LOG, "GATEWAY=%pI4 ", &ich->un.gateway); /* Fall through */ case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_TIME_EXCEEDED: /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. 
*/ - printk("["); + ve_printk(VE_LOG, "["); dump_packet(info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); - printk("] "); + ve_printk(VE_LOG, "] "); } /* Max length: 10 "MTU=65535 " */ if (ich->type == ICMP_DEST_UNREACH && ich->code == ICMP_FRAG_NEEDED) - printk("MTU=%u ", ntohs(ich->un.frag.mtu)); + ve_printk(VE_LOG, "MTU=%u ", ntohs(ich->un.frag.mtu)); } break; } @@ -291,19 +291,19 @@ static void dump_packet(const struct nf_ break; /* Max length: 9 "PROTO=AH " */ - printk("PROTO=AH "); + ve_printk(VE_LOG, "PROTO=AH "); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ah = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_ahdr), &_ahdr); if (ah == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(ah->spi)); + ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi)); break; } case IPPROTO_ESP: { @@ -311,7 +311,7 @@ static void dump_packet(const struct nf_ const struct ip_esp_hdr *eh; /* Max length: 10 "PROTO=ESP " */ - printk("PROTO=ESP "); + ve_printk(VE_LOG, "PROTO=ESP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -320,25 +320,25 @@ static void dump_packet(const struct nf_ eh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_esph), &_esph); if (eh == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(eh->spi)); + ve_printk(VE_LOG, "SPI=0x%x ", ntohl(eh->spi)); break; } /* Max length: 10 "PROTO 255 " */ default: - printk("PROTO=%u ", ih->protocol); + ve_printk(VE_LOG, "PROTO=%u ", ih->protocol); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u GID=%u ", + ve_printk(VE_LOG, "UID=%u GID=%u ", skb->sk->sk_socket->file->f_cred->fsuid, 
skb->sk->sk_socket->file->f_cred->fsgid); read_unlock_bh(&skb->sk->sk_callback_lock); @@ -346,7 +346,7 @@ static void dump_packet(const struct nf_ /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) - printk("MARK=0x%x ", skb->mark); + ve_printk(VE_LOG, "MARK=0x%x ", skb->mark); /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ @@ -386,7 +386,7 @@ ipt_log_packet(u_int8_t pf, loginfo = &default_loginfo; spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); @@ -397,30 +397,30 @@ ipt_log_packet(u_int8_t pf, physindev = skb->nf_bridge->physindev; if (physindev && in != physindev) - printk("PHYSIN=%s ", physindev->name); + ve_printk(VE_LOG, "PHYSIN=%s ", physindev->name); physoutdev = skb->nf_bridge->physoutdev; if (physoutdev && out != physoutdev) - printk("PHYSOUT=%s ", physoutdev->name); + ve_printk(VE_LOG, "PHYSOUT=%s ", physoutdev->name); } #endif if (in && !out) { /* MAC logging for input chain only. */ - printk("MAC="); + ve_printk(VE_LOG, "MAC="); if (skb->dev && skb->dev->hard_header_len && skb->mac_header != skb->network_header) { int i; const unsigned char *p = skb_mac_header(skb); for (i = 0; i < skb->dev->hard_header_len; i++,p++) - printk("%02x%c", *p, + ve_printk(VE_LOG, "%02x%c", *p, i==skb->dev->hard_header_len - 1 ? 
' ':':'); } else - printk(" "); + ve_printk(VE_LOG, " "); } dump_packet(loginfo, skb, 0); - printk("\n"); + ve_printk(VE_LOG, "\n"); spin_unlock_bh(&log_lock); } diff -urNp linux-2.6.32.48/net/ipv4/netfilter/ipt_MASQUERADE.c linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_MASQUERADE.c --- linux-2.6.32.48/net/ipv4/netfilter/ipt_MASQUERADE.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_MASQUERADE.c 2011-11-21 17:40:47.000000000 -0500 @@ -88,6 +88,7 @@ masquerade_tg(struct sk_buff *skb, const return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC); } +#if 0 static int device_cmp(struct nf_conn *i, void *ifindex) { @@ -134,6 +135,7 @@ static struct notifier_block masq_dev_no static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; +#endif static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", @@ -152,12 +154,16 @@ static int __init masquerade_tg_init(voi ret = xt_register_target(&masquerade_tg_reg); +#if 0 +/* These notifiers are unnecessary and may + lead to oops in virtual environments */ if (ret == 0) { /* Register for device down reports */ register_netdevice_notifier(&masq_dev_notifier); /* Register IP address change reports */ register_inetaddr_notifier(&masq_inet_notifier); } +#endif return ret; } @@ -165,8 +171,8 @@ static int __init masquerade_tg_init(voi static void __exit masquerade_tg_exit(void) { xt_unregister_target(&masquerade_tg_reg); - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); +/* unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier);*/ } module_init(masquerade_tg_init); diff -urNp linux-2.6.32.48/net/ipv4/netfilter/ipt_REDIRECT.c linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_REDIRECT.c --- linux-2.6.32.48/net/ipv4/netfilter/ipt_REDIRECT.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_REDIRECT.c 2011-11-21 
17:40:47.000000000 -0500 @@ -67,8 +67,13 @@ redirect_tg(struct sk_buff *skb, const s rcu_read_lock(); indev = __in_dev_get_rcu(skb->dev); - if (indev && (ifa = indev->ifa_list)) + if (indev && (ifa = indev->ifa_list)) { + /* because of venet device specific, we should use + * second ifa in the list */ + if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && ifa->ifa_next) + ifa = ifa->ifa_next; newdst = ifa->ifa_local; + } rcu_read_unlock(); if (!newdst) diff -urNp linux-2.6.32.48/net/ipv4/netfilter/ipt_REJECT.c linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_REJECT.c --- linux-2.6.32.48/net/ipv4/netfilter/ipt_REJECT.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/ipt_REJECT.c 2011-11-21 17:40:47.000000000 -0500 @@ -180,13 +180,13 @@ static bool reject_tg_check(const struct const struct ipt_entry *e = par->entryinfo; if (rejinfo->with == IPT_ICMP_ECHOREPLY) { - printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); + ve_printk(VE_LOG, "ipt_REJECT: ECHOREPLY no longer supported.\n"); return false; } else if (rejinfo->with == IPT_TCP_RESET) { /* Must specify that it's a TCP packet */ if (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO)) { - printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n"); + ve_printk(VE_LOG, "ipt_REJECT: TCP_RESET invalid for non-tcp\n"); return false; } } diff -urNp linux-2.6.32.48/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c linux-2.6.32.48-openvz/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c --- linux-2.6.32.48/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2011-11-21 17:40:47.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -367,6 +368,30 @@ struct nf_conntrack_l3proto nf_conntrack .me = THIS_MODULE, }; +static int nf_conntrack_l3proto_ipv4_init_net(struct net *net) +{ + if (!net_ipt_permitted(net, VE_IP_CONNTRACK)) + return 0; + /* + * FIXME: 
+ * Need virtualize per-net sysctls + */ + + net_ipt_module_set(net, VE_IP_CONNTRACK); + return 0; +} + +static void nf_conntrack_l3proto_ipv4_fini_net(struct net *net) +{ + if (!net_is_ipt_module_set(net, VE_IP_CONNTRACK)) + return; +} + +static struct pernet_operations nf_conntrack_ipv4_net_ops = { + .init = nf_conntrack_l3proto_ipv4_init_net, + .exit = nf_conntrack_l3proto_ipv4_fini_net, +}; + module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); @@ -381,6 +406,12 @@ static int __init nf_conntrack_l3proto_i need_conntrack(); nf_defrag_ipv4_enable(); + ret = register_pernet_subsys(&nf_conntrack_ipv4_net_ops); + if (ret) { + printk(KERN_ERR "nf_conntrack_ipv4: Unable to register pernet operations\n"); + return ret; + } + ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { printk(KERN_ERR "Unable to register netfilter socket option\n"); @@ -452,6 +483,7 @@ static void __exit nf_conntrack_l3proto_ nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); nf_unregister_sockopt(&so_getorigdst); + unregister_pernet_subsys(&nf_conntrack_ipv4_net_ops); } module_init(nf_conntrack_l3proto_ipv4_init); diff -urNp linux-2.6.32.48/net/ipv4/netfilter/nf_nat_core.c linux-2.6.32.48-openvz/net/ipv4/netfilter/nf_nat_core.c --- linux-2.6.32.48/net/ipv4/netfilter/nf_nat_core.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/nf_nat_core.c 2011-11-21 17:40:47.000000000 -0500 @@ -275,6 +275,22 @@ out: rcu_read_unlock(); } +void nf_nat_hash_conntrack(struct net *net, struct nf_conn *ct) +{ + unsigned int srchash; + struct nf_conn_nat *nat; + + srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + /* nf_conntrack_alter_reply might re-allocate exntension aera */ + nat = nfct_nat(ct); + nat->ct = ct; + hlist_add_head_rcu(&nat->bysource, + &net->ipv4.nat_bysource[srchash]); + 
spin_unlock_bh(&nf_nat_lock); +} +EXPORT_SYMBOL_GPL(nf_nat_hash_conntrack); + unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, @@ -324,18 +340,8 @@ nf_nat_setup_info(struct nf_conn *ct, } /* Place in source hash if this is the first time. */ - if (have_to_hash) { - unsigned int srchash; - - srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_lock); - /* nf_conntrack_alter_reply might re-allocate exntension aera */ - nat = nfct_nat(ct); - nat->ct = ct; - hlist_add_head_rcu(&nat->bysource, - &net->ipv4.nat_bysource[srchash]); - spin_unlock_bh(&nf_nat_lock); - } + if (have_to_hash) + nf_nat_hash_conntrack(net, ct); /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) @@ -676,6 +682,9 @@ nfnetlink_parse_nat_setup(struct nf_conn static int __net_init nf_nat_net_init(struct net *net) { + if (net_ipt_permitted(net, VE_IP_NAT)) + net_ipt_module_set(net, VE_IP_NAT); + /* Leave them the same for the moment. */ net->ipv4.nat_htable_size = net->ct.htable_size; net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, diff -urNp linux-2.6.32.48/net/ipv4/netfilter/nf_nat_rule.c linux-2.6.32.48-openvz/net/ipv4/netfilter/nf_nat_rule.c --- linux-2.6.32.48/net/ipv4/netfilter/nf_nat_rule.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/netfilter/nf_nat_rule.c 2011-11-21 17:40:47.000000000 -0500 @@ -186,15 +186,24 @@ static struct xt_target ipt_dnat_reg __r static int __net_init nf_nat_rule_net_init(struct net *net) { + if (!net_ipt_permitted(net, VE_IP_IPTABLE_NAT)) + return 0; + net->ipv4.nat_table = ipt_register_table(net, &nat_table, &nat_initial_table.repl); if (IS_ERR(net->ipv4.nat_table)) return PTR_ERR(net->ipv4.nat_table); + + net_ipt_module_set(net, VE_IP_IPTABLE_NAT); + return 0; } static void __net_exit nf_nat_rule_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_IPTABLE_NAT)) + return; + ipt_unregister_table(net->ipv4.nat_table); } diff 
-urNp linux-2.6.32.48/net/ipv4/proc.c linux-2.6.32.48-openvz/net/ipv4/proc.c --- linux-2.6.32.48/net/ipv4/proc.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/proc.c 2011-11-21 17:40:47.000000000 -0500 @@ -54,7 +54,7 @@ static int sockstat_seq_show(struct seq_ int orphans, sockets; local_bh_disable(); - orphans = percpu_counter_sum_positive(&tcp_orphan_count); + orphans = percpu_counter_sum_positive(&get_exec_ub()->ub_orphan_count); sockets = percpu_counter_sum_positive(&tcp_sockets_allocated); local_bh_enable(); diff -urNp linux-2.6.32.48/net/ipv4/route.c linux-2.6.32.48-openvz/net/ipv4/route.c --- linux-2.6.32.48/net/ipv4/route.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/route.c 2011-11-21 17:40:47.000000000 -0500 @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -116,6 +117,7 @@ #define RT_GC_TIMEOUT (300*HZ) +int ip_rt_src_check = 1; static int ip_rt_max_size; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; static int ip_rt_gc_interval __read_mostly = 60 * HZ; @@ -1421,6 +1423,9 @@ void ip_rt_redirect(__be32 old_gw, __be3 rt->u.dst.xfrm = NULL; #endif rt->rt_genid = rt_genid(net); +#ifdef CONFIG_VE + rt->fl.owner_env = get_exec_env(); +#endif rt->rt_flags |= RTCF_REDIRECTED; /* Gateway is different ... 
*/ @@ -1877,9 +1882,12 @@ static int ip_route_input_mc(struct sk_b #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; + rth->u.dst.dev = get_exec_env()->ve_netns->loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; @@ -2015,6 +2023,9 @@ static int __mkroute_input(struct sk_buf rth->fl.fl4_src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_iif = rth->fl.iif = in_dev->dev->ifindex; rth->u.dst.dev = (out_dev)->dev; @@ -2209,6 +2220,9 @@ local_input: rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->u.dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; if (res.type == RTN_UNREACHABLE) { @@ -2402,6 +2416,9 @@ static int __mkroute_output(struct rtabl rth->fl.mark = oldflp->mark; rth->rt_dst = fl->fl4_dst; rth->rt_src = fl->fl4_src; +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_iif = oldflp->oif ? 
: dev_out->ifindex; /* get references to the devices that are to be hold by the routing cache entry */ @@ -2542,7 +2559,7 @@ static int ip_route_output_slow(struct n goto make_route; } - if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { + if (!(oldflp->flags & FLOWI_FLAG_ANYSRC) && ip_rt_src_check) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(net, oldflp->fl4_src); if (dev_out == NULL) diff -urNp linux-2.6.32.48/net/ipv4/sysctl_net_ipv4.c linux-2.6.32.48-openvz/net/ipv4/sysctl_net_ipv4.c --- linux-2.6.32.48/net/ipv4/sysctl_net_ipv4.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/sysctl_net_ipv4.c 2011-11-21 17:40:47.000000000 -0500 @@ -26,6 +26,9 @@ static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; +int sysctl_tcp_use_sg = 1; +EXPORT_SYMBOL(sysctl_tcp_use_sg); + /* Update system visible IP port range */ static void set_local_port_range(int range[2]) { @@ -796,6 +799,27 @@ static struct ctl_table ipv4_net_table[] .proc_handler = proc_dointvec }, { + .procname = "tcp_max_tw_kmem_fraction", + .data = &sysctl_tcp_max_tw_kmem_fraction, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_max_tw_buckets_ub", + .data = &sysctl_tcp_max_tw_buckets_ub, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_use_sg", + .data = &sysctl_tcp_use_sg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .ctl_name = CTL_UNNUMBERED, .procname = "rt_cache_rebuild_count", .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, diff -urNp linux-2.6.32.48/net/ipv4/tcp.c linux-2.6.32.48-openvz/net/ipv4/tcp.c --- linux-2.6.32.48/net/ipv4/tcp.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/tcp.c 2011-11-21 17:40:47.000000000 -0500 @@ -272,6 +272,10 @@ #include #include +#include +#include +#include 
+ #include #include @@ -375,6 +379,7 @@ unsigned int tcp_poll(struct file *file, unsigned int mask; struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); + int check_send_space; sock_poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == TCP_LISTEN) @@ -387,6 +392,21 @@ unsigned int tcp_poll(struct file *file, mask = 0; + check_send_space = 1; +#ifdef CONFIG_BEANCOUNTERS + if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) { + unsigned long size; + size = MAX_TCP_HEADER + tp->mss_cache; + if (size > SOCK_MIN_UBCSPACE) + size = SOCK_MIN_UBCSPACE; + size = skb_charge_size(size); + if (ub_sock_makewres_tcp(sk, size)) { + check_send_space = 0; + ub_sock_sndqueueadd_tcp(sk, size); + } + } +#endif + /* * POLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -434,7 +454,7 @@ unsigned int tcp_poll(struct file *file, if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ @@ -688,7 +708,7 @@ struct sk_buff *sk_stream_alloc_skb(stru skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (sk_wmem_schedule(sk, skb->truesize, skb)) { /* * Make sure that we have exactly size bytes * available to the caller, no more, no less. 
@@ -774,15 +794,23 @@ static ssize_t do_tcp_sendpages(struct s int copy, i, can_coalesce; int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); + unsigned long chargesize = 0; if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: + chargesize = 0; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; + chargesize = skb_charge_size(MAX_TCP_HEADER + + tp->mss_cache); + if (ub_sock_getwres_tcp(sk, chargesize) < 0) + goto wait_for_ubspace; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (!skb) goto wait_for_memory; + ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + chargesize = 0; skb_entail(sk, skb); copy = size_goal; @@ -797,7 +825,7 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } - if (!sk_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy, skb)) goto wait_for_memory; if (can_coalesce) { @@ -838,10 +866,15 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: + ub_sock_retwres_tcp(sk, chargesize, + skb_charge_size(MAX_TCP_HEADER + tp->mss_cache)); + chargesize = 0; +wait_for_ubspace: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = __sk_stream_wait_memory(sk, &timeo, chargesize); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -877,12 +910,8 @@ ssize_t tcp_sendpage(struct socket *sock return res; } -#define TCP_PAGE(sk) (sk->sk_sndmsg_page) -#define TCP_OFF(sk) (sk->sk_sndmsg_off) - -static inline int select_size(struct sock *sk) +static inline int select_size(struct sock *sk, struct tcp_sock *tp) { - struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { @@ -940,6 +969,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru while (--iovlen >= 0) { size_t seglen = iov->iov_len; unsigned char __user *from = iov->iov_base; + unsigned long chargesize = 0; iov++; @@ -955,17 +985,27 @@ int 
tcp_sendmsg(struct kiocb *iocb, stru } if (copy <= 0) { + unsigned long size; new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. */ + chargesize = 0; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, select_size(sk), + size = select_size(sk, tp); + chargesize = skb_charge_size(MAX_TCP_HEADER + + size); + if (ub_sock_getwres_tcp(sk, chargesize) < 0) + goto wait_for_ubspace; + skb = sk_stream_alloc_skb(sk, size, sk->sk_allocation); if (!skb) goto wait_for_memory; + ub_skb_set_charge(skb, sk, chargesize, + UB_TCPSNDBUF); + chargesize = 0; /* * Check whether we can use HW checksum. @@ -1012,6 +1052,7 @@ new_segment: } else if (page) { if (off == PAGE_SIZE) { put_page(page); + ub_sock_tcp_detachpage(sk); TCP_PAGE(sk) = page = NULL; off = 0; } @@ -1021,10 +1062,13 @@ new_segment: if (copy > PAGE_SIZE - off) copy = PAGE_SIZE - off; - if (!sk_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy, skb)) goto wait_for_memory; if (!page) { + chargesize = PAGE_SIZE; + if (ub_sock_tcp_chargepage(sk) < 0) + goto wait_for_ubspace; /* Allocate new cache page. 
*/ if (!(page = sk_stream_alloc_page(sk))) goto wait_for_memory; @@ -1056,7 +1100,8 @@ new_segment: } else if (off + copy < PAGE_SIZE) { get_page(page); TCP_PAGE(sk) = page; - } + } else + ub_sock_tcp_detachpage(sk); } TCP_OFF(sk) = off + copy; @@ -1087,10 +1132,15 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: + ub_sock_retwres_tcp(sk, chargesize, + skb_charge_size(MAX_TCP_HEADER+tp->mss_cache)); + chargesize = 0; +wait_for_ubspace: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = __sk_stream_wait_memory(sk, &timeo, chargesize); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1188,8 +1238,10 @@ void tcp_cleanup_rbuf(struct sock *sk, i struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), - KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", - tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); + KERN_INFO "cleanup rbuf bug (%d/%s): copied %X seq %X/%X rcvnxt %X\n", + VEID(get_exec_env()), current->comm, + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, + TCP_SKB_CB(skb)->seq, tp->rcv_nxt); #endif if (inet_csk_ack_scheduled(sk)) { @@ -1451,8 +1503,9 @@ int tcp_recvmsg(struct kiocb *iocb, stru goto found_ok_skb; if (tcp_hdr(skb)->fin) goto found_fin_ok; - WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " + WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2 (%d/%s): " "copied %X seq %X rcvnxt %X fl %X\n", + VEID(get_exec_env()), current->comm, *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } @@ -1515,8 +1568,19 @@ int tcp_recvmsg(struct kiocb *iocb, stru tp->ucopy.len = len; - WARN_ON(tp->copied_seq != tp->rcv_nxt && - !(flags & (MSG_PEEK | MSG_TRUNC))); + if (WARN_ON(tp->copied_seq != tp->rcv_nxt && + !(flags & (MSG_PEEK | MSG_TRUNC)))) { + printk("KERNEL: assertion: tp->copied_seq == " + "tp->rcv_nxt || ...\n"); + printk("VE%u 
pid %d comm %.16s\n", + (get_exec_env() ? + VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("flags=0x%x, len=%d, copied_seq=%d, " + "rcv_nxt=%d\n", flags, + (int)len, tp->copied_seq, + tp->rcv_nxt); + } /* Ugly... If prequeue is not empty, we have to * process it before releasing socket, otherwise @@ -1940,7 +2004,7 @@ adjudge_to_death: bh_lock_sock(sk); WARN_ON(sock_owned_by_user(sk)); - percpu_counter_inc(sk->sk_prot->orphan_count); + ub_inc_orphan_count(sk); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) @@ -1981,10 +2045,12 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, 0)) { + if (ub_too_many_orphans(sk, 0)) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " - "sockets\n"); + "sockets (%d in CT%d)\n", + ub_get_orphan_count(sk), + sock_has_ubc(sk) ? sock_bc(sk)->ub->ub_uid : -1); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(sock_net(sk), @@ -2061,6 +2127,7 @@ int tcp_disconnect(struct sock *sk, int tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; + tp->advmss = 65535; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2888,10 +2955,11 @@ void __init tcp_init(void) percpu_counter_init(&tcp_sockets_allocated, 0); percpu_counter_init(&tcp_orphan_count, 0); + percpu_counter_init(&get_ub0()->ub_orphan_count, 0); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); /* Size and allocate the main established and bind bucket * hash tables. 
@@ -2952,6 +3020,11 @@ void __init tcp_init(void) sysctl_tcp_mem[1] = limit; sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096) + sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096; + if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096) + sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096; + /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); diff -urNp linux-2.6.32.48/net/ipv4/tcp_input.c linux-2.6.32.48-openvz/net/ipv4/tcp_input.c --- linux-2.6.32.48/net/ipv4/tcp_input.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/tcp_input.c 2011-11-21 17:40:47.000000000 -0500 @@ -72,6 +72,8 @@ #include #include +#include + int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; @@ -307,7 +309,7 @@ static void tcp_grow_window(struct sock /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !tcp_memory_pressure) { + ub_tcp_rmem_allows_expand(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -377,6 +379,8 @@ static void tcp_init_buffer_space(struct tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_time_stamp; + + ub_tcp_update_maxadvmss(sk); } /* 5. Recalculate window clamp after socket hit its memory bounds. 
*/ @@ -389,7 +393,7 @@ static void tcp_clamp_window(struct sock if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && + !ub_tcp_memory_pressure(sk) && atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -4270,19 +4274,19 @@ static void tcp_ofo_queue(struct sock *s static int tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); -static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +static inline int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_rmem_schedule(sk, size)) { + !sk_rmem_schedule(sk, skb)) { if (tcp_prune_queue(sk) < 0) return -1; - if (!sk_rmem_schedule(sk, size)) { + if (!sk_rmem_schedule(sk, skb)) { if (!tcp_prune_ofo_queue(sk)) return -1; - if (!sk_rmem_schedule(sk, size)) + if (!sk_rmem_schedule(sk, skb)) return -1; } } @@ -4334,8 +4338,8 @@ static void tcp_data_queue(struct sock * if (eaten <= 0) { queue_and_out: if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb->truesize)) - goto drop; + tcp_try_rmem_schedule(sk, skb)) + goto drop_part; skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); @@ -4379,6 +4383,12 @@ out_of_window: drop: __kfree_skb(skb); return; + +drop_part: + if (after(tp->copied_seq, tp->rcv_nxt)) + tp->rcv_nxt = tp->copied_seq; + __kfree_skb(skb); + return; } /* Out of window. F.e. zero window probe. */ @@ -4405,7 +4415,7 @@ drop: TCP_ECN_check_ce(tp, skb); - if (tcp_try_rmem_schedule(sk, skb->truesize)) + if (tcp_try_rmem_schedule(sk, skb)) goto drop; /* Disable header prediction. 
*/ @@ -4591,6 +4601,10 @@ restart: nskb = alloc_skb(copy + header, GFP_ATOMIC); if (!nskb) return; + if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) { + kfree_skb(nskb); + return; + } skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); skb_set_network_header(nskb, (skb_network_header(skb) - @@ -4719,7 +4733,7 @@ static int tcp_prune_queue(struct sock * if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (tcp_memory_pressure) + else if (ub_tcp_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4785,7 +4799,7 @@ static int tcp_should_expand_sndbuf(stru return 0; /* If we are under global TCP memory pressure, do not expand. */ - if (tcp_memory_pressure) + if (ub_tcp_memory_pressure(sk)) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ @@ -5288,6 +5302,10 @@ int tcp_rcv_established(struct sock *sk, if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; + /* This is OK not to try to free memory here. + * Do this below on slow path. 
Den */ + if (ub_tcprcvbuf_charge(sk, skb) < 0) + goto step5; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); diff -urNp linux-2.6.32.48/net/ipv4/tcp_ipv4.c linux-2.6.32.48-openvz/net/ipv4/tcp_ipv4.c --- linux-2.6.32.48/net/ipv4/tcp_ipv4.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/tcp_ipv4.c 2011-11-21 17:40:47.000000000 -0500 @@ -73,6 +73,8 @@ #include #include +#include + #include #include #include @@ -716,7 +718,8 @@ static void tcp_v4_timewait_ack(struct s struct tcp_timewait_sock *tcptw = tcp_twsk(sk); tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, - tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcptw->tw_rcv_wnd >> + (tw->tw_rcv_wscale & TW_WSCALE_MASK), tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -1195,6 +1198,7 @@ struct request_sock_ops tcp_request_sock .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, }; +EXPORT_SYMBOL_GPL(tcp_request_sock_ops); #ifdef CONFIG_TCP_MD5SIG static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { @@ -1496,6 +1500,10 @@ static __sum16 tcp_v4_checksum_init(stru int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *rsk; + struct user_beancounter *ub; + + ub = set_exec_ub(sock_bc(sk)->ub); + #ifdef CONFIG_TCP_MD5SIG /* * We really want to reject the packet as early as possible @@ -1514,7 +1522,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc goto reset; } TCP_CHECK_TIMER(sk); - return 0; + goto restore_context; } if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) @@ -1530,7 +1538,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc rsk = nsk; goto reset; } - return 0; + goto restore_context; } } @@ -1540,6 +1548,9 @@ int tcp_v4_do_rcv(struct sock *sk, struc goto reset; } TCP_CHECK_TIMER(sk); + +restore_context: + (void)set_exec_ub(ub); return 0; reset: @@ -1551,7 +1562,7 @@ discard: * might be destroyed here. This current version compiles correctly, * but you have been warned. 
*/ - return 0; + goto restore_context; csum_err: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); @@ -1813,6 +1824,8 @@ static int tcp_v4_init_sock(struct sock tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; + tp->advmss = 65535; /* max value */ + tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; @@ -1876,6 +1889,8 @@ void tcp_v4_destroy_sock(struct sock *sk * If sendmsg cached page exists, toss it. */ if (sk->sk_sndmsg_page) { + /* queue is empty, uncharge */ + ub_sock_tcp_detachpage(sk); __free_page(sk->sk_sndmsg_page); sk->sk_sndmsg_page = NULL; } @@ -1950,7 +1965,9 @@ get_req: } get_sk: sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == st->family) { cur = sk; goto out; } @@ -2479,6 +2496,93 @@ void __init tcp_v4_init(void) panic("Failed to create the TCP control socket.\n"); } +#ifdef CONFIG_VE +static void tcp_kill_ve_onesk(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Check the assumed state of the socket. */ + if (!sock_flag(sk, SOCK_DEAD)) { + printk(KERN_WARNING "Killing sk: dead %d, state %d, " + "wrseq %u unseq %u, wrqu %d.\n", + sock_flag(sk, SOCK_DEAD), sk->sk_state, + tp->write_seq, tp->snd_una, + !skb_queue_empty(&sk->sk_write_queue)); + sk->sk_err = ECONNRESET; + sk->sk_error_report(sk); + } + + tcp_send_active_reset(sk, GFP_ATOMIC); + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + case TCP_CLOSING: + /* In these 2 states the peer may want us to retransmit + * some data and/or FIN. Entering "resetting mode" + * instead. + */ + tcp_time_wait(sk, TCP_CLOSE, 0); + break; + case TCP_FIN_WAIT2: + /* By some reason the socket may stay in this state + * without turning into a TW bucket. Fix it. + */ + tcp_time_wait(sk, TCP_FIN_WAIT2, 0); + break; + default: + /* Just jump into CLOSED state. 
*/ + tcp_done(sk); + break; + } +} + +void tcp_v4_kill_ve_sockets(struct ve_struct *envid) +{ + struct inet_ehash_bucket *head; + int i, retry; + + /* alive */ +again: + retry = 0; + local_bh_disable(); + head = tcp_hashinfo.ehash; + for (i = 0; i < tcp_hashinfo.ehash_size; i++) { + struct sock *sk; + struct hlist_nulls_node *node; + spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i); +more_work: + spin_lock(lock); + sk_nulls_for_each(sk, node, &head[i].chain) { + if (ve_accessible_strict(sk->owner_env, envid)) { + sock_hold(sk); + spin_unlock(lock); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + retry = 1; + bh_unlock_sock(sk); + sock_put(sk); + break; + } + /* sk might have disappeared from the hash before + * we got the lock */ + if (sk->sk_state != TCP_CLOSE) + tcp_kill_ve_onesk(sk); + bh_unlock_sock(sk); + sock_put(sk); + goto more_work; + } + } + spin_unlock(lock); + } + local_bh_enable(); + if (retry) { + schedule_timeout_interruptible(HZ); + goto again; + } +} +EXPORT_SYMBOL(tcp_v4_kill_ve_sockets); +#endif + EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(tcp_hashinfo); EXPORT_SYMBOL(tcp_prot); diff -urNp linux-2.6.32.48/net/ipv4/tcp_minisocks.c linux-2.6.32.48-openvz/net/ipv4/tcp_minisocks.c --- linux-2.6.32.48/net/ipv4/tcp_minisocks.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/tcp_minisocks.c 2011-11-21 17:40:47.000000000 -0500 @@ -26,6 +26,9 @@ #include #include +#include +#include + #ifdef CONFIG_SYSCTL #define SYNC_INIT 0 /* let the user enable it */ #else @@ -36,6 +39,11 @@ int sysctl_tcp_syncookies __read_mostly EXPORT_SYMBOL(sysctl_tcp_syncookies); int sysctl_tcp_abort_on_overflow __read_mostly; +int sysctl_tcp_max_tw_kmem_fraction __read_mostly = 384; +int sysctl_tcp_max_tw_buckets_ub __read_mostly = 16536; + +EXPORT_SYMBOL(sysctl_tcp_max_tw_kmem_fraction); +EXPORT_SYMBOL(sysctl_tcp_max_tw_buckets_ub); struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, @@ -51,6 +59,7 @@ 
struct inet_timewait_death_row tcp_death .twcal_hand = -1, .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, (unsigned long)&tcp_death_row), + .ub_managed = 1, }; EXPORT_SYMBOL_GPL(tcp_death_row); @@ -280,7 +289,8 @@ void tcp_time_wait(struct sock *sk, int if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); - if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) + if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets && + ub_timewait_check(sk, &tcp_death_row)) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { @@ -293,6 +303,8 @@ void tcp_time_wait(struct sock *sk, int tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + if (sk->sk_user_data != NULL) + tw->tw_rcv_wscale |= TW_WSCALE_SPEC; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { @@ -327,6 +339,7 @@ void tcp_time_wait(struct sock *sk, int } } while (0); #endif + tw->tw_owner_env = VEID(sk->owner_env); /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); @@ -347,11 +360,16 @@ void tcp_time_wait(struct sock *sk, int TCP_TIMEWAIT_LEN); inet_twsk_put(tw); } else { + int ubid = 0; /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. 
*/ - LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + ubid = top_beancounter(sock_bc(sk)->ub)->ub_uid; +#endif + LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow (CT%d)\n", ubid); } tcp_update_metrics(sk); @@ -392,6 +410,8 @@ struct sock *tcp_create_openreq_child(st struct tcp_sock *newtp; /* Now setup tcp_sock */ + newsk->owner_env = sk->owner_env; + newtp = tcp_sk(newsk); newtp->pred_flags = 0; newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; diff -urNp linux-2.6.32.48/net/ipv4/tcp_output.c linux-2.6.32.48-openvz/net/ipv4/tcp_output.c --- linux-2.6.32.48/net/ipv4/tcp_output.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/tcp_output.c 2011-11-21 17:40:47.000000000 -0500 @@ -39,6 +39,9 @@ #include #include +#include +#include + /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse __read_mostly = 1; @@ -353,11 +356,6 @@ static void tcp_init_nondata_skb(struct TCP_SKB_CB(skb)->end_seq = seq; } -static inline int tcp_urg_mode(const struct tcp_sock *tp) -{ - return tp->snd_una != tp->snd_up; -} - #define OPTION_SACK_ADVERTISE (1 << 0) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) @@ -598,6 +596,13 @@ static unsigned tcp_established_options( return size; } +static int skb_header_size(struct sock *sk, int tcp_hlen) +{ + struct ip_options *opt = inet_sk(sk)->opt; + return tcp_hlen + sizeof(struct iphdr) + + (opt ? opt->optlen : 0) + ETH_HLEN /* For hard header */; +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. 
@@ -622,6 +627,7 @@ static int tcp_transmit_skb(struct sock __u8 *md5_hash_location; struct tcphdr *th; int err; + int header_size; BUG_ON(!skb || !tcp_skb_pcount(skb)); @@ -652,6 +658,20 @@ static int tcp_transmit_skb(struct sock &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); + /* Unfortunately, we can have skb from outside world here + * with size insufficient for header. It is impossible to make + * guess when we queue skb, so the decision should be made + * here. Den + */ + header_size = skb_header_size(sk, tcp_header_size); + if (skb->data - header_size < skb->head) { + int delta = header_size - skb_headroom(skb); + err = pskb_expand_head(skb, SKB_DATA_ALIGN(delta), + 0, GFP_ATOMIC); + if (err) + return err; + } + if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); @@ -824,15 +844,21 @@ int tcp_fragment(struct sock *sk, struct if (nsize < 0) nsize = 0; - if (skb_cloned(skb) && - skb_is_nonlinear(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; + if (skb_cloned(skb) && skb_is_nonlinear(skb)) { + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + ub_skb_uncharge(skb); + ub_tcpsndbuf_charge_forced(sk, skb); + } /* Get a new skb... force flag on. */ buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); if (buff == NULL) return -ENOMEM; /* We'll just try again later. 
*/ + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOMEM; + } sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); @@ -1299,6 +1325,11 @@ static int tso_fragment(struct sock *sk, if (unlikely(buff == NULL)) return -ENOMEM; + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOMEM; + } + sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; @@ -1728,7 +1759,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (tcp_memory_pressure) + if (ub_tcp_shrink_rcvbuf(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -2145,6 +2176,7 @@ void tcp_send_fin(struct sock *sk) break; yield(); } + ub_tcpsndbuf_charge_forced(sk, skb); /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); @@ -2204,6 +2236,10 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) return -ENOMEM; + if (ub_tcpsndbuf_charge(sk, nskb) < 0) { + kfree_skb(nskb); + return -ENOMEM; + } tcp_unlink_write_queue(skb, sk); skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); @@ -2313,6 +2349,7 @@ static void tcp_connect_init(struct sock struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + static int once = 0; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -2332,11 +2369,25 @@ static void tcp_connect_init(struct sock tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); + if (!once && dst_metric(dst, RTAX_ADVMSS) == 0) { + once = 1; + + printk("Oops in connect_init! 
dst->advmss=%d\n", + dst_metric(dst, RTAX_ADVMSS)); + printk("dst: pmtu=%u\n", dst_metric(dst, RTAX_MTU)); + printk("sk->state=%d, tp: ack.rcv_mss=%d, mss_cache=%d, " + "advmss=%d, user_mss=%d\n", + sk->sk_state, inet_csk(sk)->icsk_ack.rcv_mss, + tp->mss_cache, tp->advmss, tp->rx_opt.user_mss); + } + if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) tp->advmss = tp->rx_opt.user_mss; + if (tp->advmss == 0) + tp->advmss = 1460; tcp_initialize_rcv_mss(sk); @@ -2377,6 +2428,10 @@ int tcp_connect(struct sock *sk) buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOBUFS; + } /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); diff -urNp linux-2.6.32.48/net/ipv4/tcp_timer.c linux-2.6.32.48-openvz/net/ipv4/tcp_timer.c --- linux-2.6.32.48/net/ipv4/tcp_timer.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/tcp_timer.c 2011-11-21 17:40:47.000000000 -0500 @@ -20,6 +20,8 @@ #include #include +#include +#include int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; @@ -76,7 +78,7 @@ static int tcp_out_of_resources(struct s if (sk->sk_err_soft) shift++; - if (tcp_too_many_orphans(sk, shift)) { + if (ub_too_many_orphans(sk, shift)) { if (net_ratelimit()) printk(KERN_INFO "Out of socket memory\n"); @@ -177,6 +179,9 @@ static void tcp_delack_timer(unsigned lo struct sock *sk = (struct sock *)data; struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct ve_struct *ve; + + ve = set_exec_env(sk->owner_env); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -231,6 +236,8 @@ out: out_unlock: bh_unlock_sock(sk); sock_put(sk); + + (void)set_exec_env(ve); } static void 
tcp_probe_timer(struct sock *sk) @@ -238,10 +245,13 @@ static void tcp_probe_timer(struct sock struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; + struct ve_struct *ve; + + ve = set_exec_env(sk->owner_env); if (tp->packets_out || !tcp_send_head(sk)) { icsk->icsk_probes_out = 0; - return; + goto out; } /* *WARNING* RFC 1122 forbids this @@ -267,7 +277,7 @@ static void tcp_probe_timer(struct sock max_probes = tcp_orphan_retries(sk, alive); if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) - return; + goto out; } if (icsk->icsk_probes_out > max_probes) { @@ -276,6 +286,9 @@ static void tcp_probe_timer(struct sock /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); } + +out: + (void)set_exec_env(ve); } /* @@ -286,6 +299,9 @@ void tcp_retransmit_timer(struct sock *s { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct ve_struct *ve; + + ve = set_exec_env(sk->owner_env); if (!tp->packets_out) goto out; @@ -391,7 +407,8 @@ out_reset_timer: if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) __sk_dst_reset(sk); -out:; +out: + (void)set_exec_env(ve); } static void tcp_write_timer(unsigned long data) @@ -399,6 +416,9 @@ static void tcp_write_timer(unsigned lon struct sock *sk = (struct sock *)data; struct inet_connection_sock *icsk = inet_csk(sk); int event; + struct ve_struct *ve; + + ve = set_exec_env(sk->owner_env); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -433,6 +453,8 @@ out: out_unlock: bh_unlock_sock(sk); sock_put(sk); + + (void)set_exec_env(ve); } /* @@ -463,6 +485,9 @@ static void tcp_keepalive_timer (unsigne struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); __u32 elapsed; + struct ve_struct *ve; + + ve = set_exec_env(sk->owner_env); /* Only process if socket is not in use. 
*/ bh_lock_sock(sk); @@ -534,4 +559,5 @@ death: out: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(ve); } diff -urNp linux-2.6.32.48/net/ipv4/udp.c linux-2.6.32.48-openvz/net/ipv4/udp.c --- linux-2.6.32.48/net/ipv4/udp.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv4/udp.c 2011-11-21 17:40:47.000000000 -0500 @@ -138,6 +138,7 @@ static int udp_lib_lport_inuse(struct ne sk2 != sk && (bitmap || sk2->sk_hash == num) && (!sk2->sk_reuse || !sk->sk_reuse) && + sk->sk_reuse != 2 && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (*saddr_comp)(sk, sk2)) { diff -urNp linux-2.6.32.48/net/ipv6/addrconf.c linux-2.6.32.48-openvz/net/ipv6/addrconf.c --- linux-2.6.32.48/net/ipv6/addrconf.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/addrconf.c 2011-11-21 17:40:47.000000000 -0500 @@ -623,7 +623,7 @@ ipv6_add_addr(struct inet6_dev *idev, co goto out; } - ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC_UBC); if (ifa == NULL) { ADBG(("ipv6_add_addr: malloc failed\n")); @@ -2082,7 +2082,7 @@ err_exit: /* * Manual configuration of address on an interface */ -static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, +int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, __u32 valid_lft) { @@ -2154,6 +2154,7 @@ static int inet6_addr_add(struct net *ne return PTR_ERR(ifp); } +EXPORT_SYMBOL_GPL(inet6_addr_add); static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx, unsigned int plen) @@ -2185,7 +2186,8 @@ static int inet6_addr_del(struct net *ne disable IPv6 on this interface. 
*/ if (idev->addr_list == NULL) - addrconf_ifdown(idev->dev, 1); + addrconf_ifdown(idev->dev, + !(idev->dev->flags & IFF_LOOPBACK)); return 0; } } @@ -2199,7 +2201,7 @@ int addrconf_add_ifaddr(struct net *net, struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) @@ -2218,7 +2220,7 @@ int addrconf_del_ifaddr(struct net *net, struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) @@ -2728,6 +2730,9 @@ static int addrconf_ifdown(struct net_de static void addrconf_rs_timer(unsigned long data) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct ve_struct *old_env; + + old_env = set_exec_env(ifp->idev->dev->owner_env); if (ifp->idev->cnf.forwarding) goto out; @@ -2762,6 +2767,7 @@ static void addrconf_rs_timer(unsigned l out: in6_ifa_put(ifp); + (void)set_exec_env(old_env); } /* @@ -2798,6 +2804,7 @@ static void addrconf_dad_start(struct in if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || idev->cnf.accept_dad < 1 || !(ifp->flags&IFA_F_TENTATIVE) || + dev->owner_env->disable_net || ifp->flags & IFA_F_NODAD) { ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); spin_unlock_bh(&ifp->lock); @@ -2838,7 +2845,9 @@ static void addrconf_dad_timer(unsigned struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; struct inet6_dev *idev = ifp->idev; struct in6_addr mcaddr; + struct ve_struct *old_env; + old_env = set_exec_env(ifp->idev->dev->owner_env); read_lock_bh(&idev->lock); if (idev->dead) { read_unlock_bh(&idev->lock); @@ -2869,6 +2878,7 @@ static void addrconf_dad_timer(unsigned ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); + (void)set_exec_env(old_env); } static void addrconf_dad_completed(struct inet6_ifaddr *ifp) @@ -3090,6 +3100,7 @@ static void addrconf_verify(unsigned 
lon struct inet6_ifaddr *ifp; unsigned long now, next; int i; + struct ve_struct *old_env; spin_lock_bh(&addrconf_verify_lock); now = jiffies; @@ -3110,6 +3121,8 @@ restart: if (ifp->flags & IFA_F_PERMANENT) continue; + old_env = set_exec_env(ifp->idev->dev->owner_env); + spin_lock(&ifp->lock); age = (now - ifp->tstamp) / HZ; @@ -3125,9 +3138,11 @@ restart: in6_ifa_hold(ifp); read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); + (void)set_exec_env(old_env); goto restart; } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { spin_unlock(&ifp->lock); + set_exec_env(old_env); continue; } else if (age >= ifp->prefered_lft) { /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */ @@ -3149,6 +3164,7 @@ restart: ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); + (void)set_exec_env(old_env); goto restart; } #ifdef CONFIG_IPV6_PRIVACY @@ -3170,6 +3186,7 @@ restart: ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); + (void)set_exec_env(old_env); goto restart; } } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) @@ -3182,6 +3199,7 @@ restart: next = ifp->tstamp + ifp->prefered_lft * HZ; spin_unlock(&ifp->lock); } + (void)set_exec_env(old_env); } read_unlock(&addrconf_hash_lock); } diff -urNp linux-2.6.32.48/net/ipv6/af_inet6.c linux-2.6.32.48-openvz/net/ipv6/af_inet6.c --- linux-2.6.32.48/net/ipv6/af_inet6.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/af_inet6.c 2011-11-21 17:40:47.000000000 -0500 @@ -57,6 +57,10 @@ #ifdef CONFIG_IPV6_TUNNEL #include #endif +#ifdef CONFIG_IPV6_MIP6 +#include +#endif +#include #include #include @@ -157,6 +161,10 @@ lookup_protocol: goto out_rcu_unlock; } + err = vz_security_protocol_check(answer->protocol); + if (err < 0) + goto out_rcu_unlock; + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; @@ -174,6 +182,13 @@ lookup_protocol: if (sk == NULL) goto out; + err = -ENOBUFS; + if (ub_sock_charge(sk, PF_INET6, 
sock->type)) + goto out_sk_free; + /* if charge was successful, sock_init_data() MUST be called to + * set sk->sk_type. otherwise sk will be uncharged to wrong resource + */ + sock_init_data(sock, sk); err = 0; @@ -248,6 +263,9 @@ out: out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_free: + sk_free(sk); + return err; } diff -urNp linux-2.6.32.48/net/ipv6/ip6_fib.c linux-2.6.32.48-openvz/net/ipv6/ip6_fib.c --- linux-2.6.32.48/net/ipv6/ip6_fib.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/ip6_fib.c 2011-11-21 17:40:47.000000000 -0500 @@ -176,11 +176,9 @@ static void fib6_link_table(struct net * h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); - /* - * No protection necessary, this is the only list mutatation - * operation, tables never disappear once they exist. - */ + write_lock_bh(&tb->tb6_lock); hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); + write_unlock_bh(&tb->tb6_lock); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -1365,10 +1363,14 @@ void fib6_clean_all(struct net *net, int for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { + struct ve_struct *old_env; + + old_env = set_exec_env(table->owner_env); write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, prune, arg); write_unlock_bh(&table->tb6_lock); + (void)set_exec_env(old_env); } } rcu_read_unlock(); @@ -1488,6 +1490,9 @@ static int fib6_net_init(struct net *net if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; +#ifdef CONFIG_VE + net->ipv6.fib6_main_tbl->owner_env = get_exec_env(); +#endif net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; net->ipv6.fib6_main_tbl->tb6_root.fn_flags = @@ -1498,6 +1503,10 @@ static int fib6_net_init(struct net *net GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; + +#ifdef CONFIG_VE + net->ipv6.fib6_local_tbl->owner_env = get_exec_env(); 
+#endif net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; net->ipv6.fib6_local_tbl->tb6_root.fn_flags = @@ -1543,7 +1552,7 @@ int __init fib6_init(void) fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN, + 0, SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (!fib6_node_kmem) goto out; diff -urNp linux-2.6.32.48/net/ipv6/ip6_output.c linux-2.6.32.48-openvz/net/ipv6/ip6_output.c --- linux-2.6.32.48/net/ipv6/ip6_output.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/ip6_output.c 2011-11-21 17:40:47.000000000 -0500 @@ -522,6 +522,20 @@ int ip6_forward(struct sk_buff *skb) return -EMSGSIZE; } + /* + * We try to optimize forwarding of VE packets: + * do not decrement TTL (and so save skb_cow) + * during forwarding of outgoing pkts from VE. + * For incoming pkts we still do ttl decr, + * since such skb is not cloned and does not require + * actual cow. So, there is at least one place + * in pkts path with mandatory ttl decr, that is + * sufficient to prevent routing loops. 
+ */ + hdr = ipv6_hdr(skb); + if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */ + goto no_ttl_decr; + if (skb_cow(skb, dst->dev->hard_header_len)) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); goto drop; @@ -533,6 +547,7 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; +no_ttl_decr: IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); diff -urNp linux-2.6.32.48/net/ipv6/mcast.c linux-2.6.32.48-openvz/net/ipv6/mcast.c --- linux-2.6.32.48/net/ipv6/mcast.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/mcast.c 2011-11-21 17:40:47.000000000 -0500 @@ -243,6 +243,7 @@ int ipv6_sock_mc_join(struct sock *sk, i return 0; } +EXPORT_SYMBOL_GPL(ipv6_sock_mc_join); /* * socket leave on multicast group @@ -2205,15 +2206,18 @@ static void igmp6_leave_group(struct ifm static void mld_gq_timer_expire(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; + struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); idev->mc_gq_running = 0; mld_send_report(idev, NULL); __in6_dev_put(idev); + set_exec_env(old_env); } static void mld_ifc_timer_expire(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; + struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); mld_send_cr(idev); if (idev->mc_ifc_count) { @@ -2222,6 +2226,7 @@ static void mld_ifc_timer_expire(unsigne mld_ifc_start_timer(idev, idev->mc_maxdelay); } __in6_dev_put(idev); + set_exec_env(old_env); } static void mld_ifc_event(struct inet6_dev *idev) @@ -2236,6 +2241,7 @@ static void mld_ifc_event(struct inet6_d static void igmp6_timer_handler(unsigned long data) { struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; + struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env); if (MLD_V1_SEEN(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); @@ -2247,6 +2253,7 @@ static void 
igmp6_timer_handler(unsigned ma->mca_flags &= ~MAF_TIMER_RUNNING; spin_unlock(&ma->mca_lock); ma_put(ma); + set_exec_env(old_env); } /* Device changing type */ diff -urNp linux-2.6.32.48/net/ipv6/netfilter/ip6_queue.c linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6_queue.c --- linux-2.6.32.48/net/ipv6/netfilter/ip6_queue.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6_queue.c 2011-11-21 17:40:47.000000000 -0500 @@ -439,7 +439,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); write_lock_bh(&queue_lock); @@ -469,8 +469,12 @@ __ipq_rcv_skb(struct sk_buff *skb) static void ipq_rcv_skb(struct sk_buff *skb) { + struct ve_struct *old_ve; + mutex_lock(&ipqnl_mutex); + old_ve = set_exec_env(skb->owner_env); __ipq_rcv_skb(skb); + (void)set_exec_env(old_ve); mutex_unlock(&ipqnl_mutex); } @@ -480,9 +484,6 @@ ipq_rcv_dev_event(struct notifier_block { struct net_device *dev = ptr; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) ipq_dev_drop(dev->ifindex); @@ -502,7 +503,7 @@ ipq_rcv_nl_event(struct notifier_block * if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW && n->pid) { write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) + if (n->pid == peer_pid) __ipq_reset(); write_unlock_bh(&queue_lock); } diff -urNp linux-2.6.32.48/net/ipv6/netfilter/ip6table_filter.c linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6table_filter.c --- linux-2.6.32.48/net/ipv6/netfilter/ip6table_filter.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6table_filter.c 2011-11-21 17:40:47.000000000 -0500 @@ -121,16 +121,24 @@ module_param(forward, bool, 0000); static int __net_init ip6table_filter_net_init(struct net *net) { + if (!net_ipt_permitted(net, 
VE_IP_FILTER6)) + return 0; + /* Register table */ net->ipv6.ip6table_filter = ip6t_register_table(net, &packet_filter, &initial_table.repl); if (IS_ERR(net->ipv6.ip6table_filter)) return PTR_ERR(net->ipv6.ip6table_filter); + + net_ipt_module_set(net, VE_IP_FILTER6); return 0; } static void __net_exit ip6table_filter_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_FILTER6)) + return; + ip6t_unregister_table(net->ipv6.ip6table_filter); } diff -urNp linux-2.6.32.48/net/ipv6/netfilter/ip6table_mangle.c linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6table_mangle.c --- linux-2.6.32.48/net/ipv6/netfilter/ip6table_mangle.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6table_mangle.c 2011-11-21 17:40:47.000000000 -0500 @@ -172,16 +172,24 @@ static struct nf_hook_ops ip6t_ops[] __r static int __net_init ip6table_mangle_net_init(struct net *net) { + if (!net_ipt_permitted(net, VE_IP_MANGLE6)) + return 0; + /* Register table */ net->ipv6.ip6table_mangle = ip6t_register_table(net, &packet_mangler, &initial_table.repl); if (IS_ERR(net->ipv6.ip6table_mangle)) return PTR_ERR(net->ipv6.ip6table_mangle); + + net_ipt_module_set(net, VE_IP_MANGLE6); return 0; } static void __net_exit ip6table_mangle_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_MANGLE6)) + return; + ip6t_unregister_table(net->ipv6.ip6table_mangle); } diff -urNp linux-2.6.32.48/net/ipv6/netfilter/ip6_tables.c linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6_tables.c --- linux-2.6.32.48/net/ipv6/netfilter/ip6_tables.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6_tables.c 2011-11-21 17:40:47.000000000 -0500 @@ -351,6 +351,9 @@ ip6t_do_table(struct sk_buff *skb, struct xt_match_param mtpar; struct xt_target_param tgpar; + if (ve_xt_table_forbidden(table)) + return NF_ACCEPT; + /* Initialization */ indev = in ? in->name : nulldevname; outdev = out ? 
out->name : nulldevname; @@ -1898,7 +1901,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2009,7 +2012,7 @@ compat_do_ip6t_get_ctl(struct sock *sk, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2031,7 +2034,7 @@ do_ip6t_set_ctl(struct sock *sk, int cmd { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2056,7 +2059,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2110,7 +2113,7 @@ struct xt_table *ip6t_register_table(str int ret; struct xt_table_info *newinfo; struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; struct xt_table *new_table; @@ -2255,11 +2258,22 @@ static struct xt_match icmp6_matchstruct static int __net_init ip6_tables_net_init(struct net *net) { - return xt_proto_init(net, NFPROTO_IPV6); + int res; + + if (!net_ipt_permitted(net, VE_IP_IPTABLES6)) + return 0; + + res = xt_proto_init(net, NFPROTO_IPV6); + if (!res) + net_ipt_module_set(net, VE_IP_IPTABLES6); + return res; } static void __net_exit ip6_tables_net_exit(struct net *net) { + if (!net_is_ipt_module_set(net, VE_IP_IPTABLES6)) + return; + xt_proto_fini(net, NFPROTO_IPV6); } diff -urNp linux-2.6.32.48/net/ipv6/netfilter/ip6t_LOG.c linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6t_LOG.c --- linux-2.6.32.48/net/ipv6/netfilter/ip6t_LOG.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/netfilter/ip6t_LOG.c 2011-11-21 17:40:47.000000000 -0500 @@ -56,15 +56,15 @@ static void dump_packet(const struct nf_ ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (ih == NULL) { - printk("TRUNCATED"); + 
ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ - printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + ve_printk(VE_LOG, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ - printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ve_printk(VE_LOG, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", ntohs(ih->payload_len) + sizeof(struct ipv6hdr), (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, ih->hop_limit, @@ -79,35 +79,35 @@ static void dump_packet(const struct nf_ hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); if (hp == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 48 "OPT (...) " */ if (logflags & IP6T_LOG_IPOPT) - printk("OPT ( "); + ve_printk(VE_LOG, "OPT ( "); switch (currenthdr) { case IPPROTO_FRAGMENT: { struct frag_hdr _fhdr; const struct frag_hdr *fh; - printk("FRAG:"); + ve_printk(VE_LOG, "FRAG:"); fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), &_fhdr); if (fh == NULL) { - printk("TRUNCATED "); + ve_printk(VE_LOG, "TRUNCATED "); return; } /* Max length: 6 "65535 " */ - printk("%u ", ntohs(fh->frag_off) & 0xFFF8); + ve_printk(VE_LOG, "%u ", ntohs(fh->frag_off) & 0xFFF8); /* Max length: 11 "INCOMPLETE " */ if (fh->frag_off & htons(0x0001)) - printk("INCOMPLETE "); + ve_printk(VE_LOG, "INCOMPLETE "); - printk("ID:%08x ", ntohl(fh->identification)); + ve_printk(VE_LOG, "ID:%08x ", ntohl(fh->identification)); if (ntohs(fh->frag_off) & 0xFFF8) fragment = 1; @@ -121,7 +121,7 @@ static void dump_packet(const struct nf_ case IPPROTO_HOPOPTS: if (fragment) { if (logflags & IP6T_LOG_IPOPT) - printk(")"); + ve_printk(VE_LOG, ")"); return; } hdrlen = ipv6_optlen(hp); @@ -133,10 +133,10 @@ static void dump_packet(const struct nf_ const struct ip_auth_hdr *ah; /* Max length: 3 "AH " */ - printk("AH "); + ve_printk(VE_LOG, "AH "); if (fragment) { - printk(")"); + 
ve_printk(VE_LOG, ")"); return; } @@ -147,13 +147,13 @@ static void dump_packet(const struct nf_ * Max length: 26 "INCOMPLETE [65535 * bytes] )" */ - printk("INCOMPLETE [%u bytes] )", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] )", skb->len - ptr); return; } /* Length: 15 "SPI=0xF1234567 */ - printk("SPI=0x%x ", ntohl(ah->spi)); + ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi)); } @@ -165,10 +165,10 @@ static void dump_packet(const struct nf_ const struct ip_esp_hdr *eh; /* Max length: 4 "ESP " */ - printk("ESP "); + ve_printk(VE_LOG, "ESP "); if (fragment) { - printk(")"); + ve_printk(VE_LOG, ")"); return; } @@ -178,23 +178,23 @@ static void dump_packet(const struct nf_ eh = skb_header_pointer(skb, ptr, sizeof(_esph), &_esph); if (eh == NULL) { - printk("INCOMPLETE [%u bytes] )", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] )", skb->len - ptr); return; } /* Length: 16 "SPI=0xF1234567 )" */ - printk("SPI=0x%x )", ntohl(eh->spi) ); + ve_printk(VE_LOG, "SPI=0x%x )", ntohl(eh->spi) ); } return; default: /* Max length: 20 "Unknown Ext Hdr 255" */ - printk("Unknown Ext Hdr %u", currenthdr); + ve_printk(VE_LOG, "Unknown Ext Hdr %u", currenthdr); return; } if (logflags & IP6T_LOG_IPOPT) - printk(") "); + ve_printk(VE_LOG, ") "); currenthdr = hp->nexthdr; ptr += hdrlen; @@ -206,7 +206,7 @@ static void dump_packet(const struct nf_ const struct tcphdr *th; /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); + ve_printk(VE_LOG, "PROTO=TCP "); if (fragment) break; @@ -214,40 +214,40 @@ static void dump_packet(const struct nf_ /* Max length: 25 "INCOMPLETE [65535 bytes] " */ th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph); if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - ptr); return; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if 
(logflags & IP6T_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", + ve_printk(VE_LOG, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(th->window)); + ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3C " */ - printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + ve_printk(VE_LOG, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ if (th->cwr) - printk("CWR "); + ve_printk(VE_LOG, "CWR "); if (th->ece) - printk("ECE "); + ve_printk(VE_LOG, "ECE "); if (th->urg) - printk("URG "); + ve_printk(VE_LOG, "URG "); if (th->ack) - printk("ACK "); + ve_printk(VE_LOG, "ACK "); if (th->psh) - printk("PSH "); + ve_printk(VE_LOG, "PSH "); if (th->rst) - printk("RST "); + ve_printk(VE_LOG, "RST "); if (th->syn) - printk("SYN "); + ve_printk(VE_LOG, "SYN "); if (th->fin) - printk("FIN "); + ve_printk(VE_LOG, "FIN "); /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); + ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & IP6T_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { @@ -261,15 +261,15 @@ static void dump_packet(const struct nf_ ptr + sizeof(struct tcphdr), optsize, _opt); if (op == NULL) { - printk("OPT (TRUNCATED)"); + ve_printk(VE_LOG, "OPT (TRUNCATED)"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + ve_printk(VE_LOG, "OPT ("); for (i =0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + ve_printk(VE_LOG, "%02X", op[i]); + ve_printk(VE_LOG, ") "); } break; } @@ -280,9 +280,9 @@ static void dump_packet(const struct nf_ if (currenthdr == IPPROTO_UDP) /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP " ); + ve_printk(VE_LOG, "PROTO=UDP " ); else /* Max length: 14 "PROTO=UDPLITE " */ - printk("PROTO=UDPLITE "); + ve_printk(VE_LOG, "PROTO=UDPLITE "); if (fragment) break; @@ 
-290,12 +290,12 @@ static void dump_packet(const struct nf_ /* Max length: 25 "INCOMPLETE [65535 bytes] " */ uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph); if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - ptr); return; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); break; @@ -305,7 +305,7 @@ static void dump_packet(const struct nf_ const struct icmp6hdr *ic; /* Max length: 13 "PROTO=ICMPv6 " */ - printk("PROTO=ICMPv6 "); + ve_printk(VE_LOG, "PROTO=ICMPv6 "); if (fragment) break; @@ -313,18 +313,18 @@ static void dump_packet(const struct nf_ /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); if (ic == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - ptr); return; } /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); + ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); switch (ic->icmp6_type) { case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", + ve_printk(VE_LOG, "ID=%u SEQ=%u ", ntohs(ic->icmp6_identifier), ntohs(ic->icmp6_sequence)); break; @@ -335,35 +335,35 @@ static void dump_packet(const struct nf_ case ICMPV6_PARAMPROB: /* Max length: 17 "POINTER=ffffffff " */ - printk("POINTER=%08x ", ntohl(ic->icmp6_pointer)); + ve_printk(VE_LOG, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); /* Fall through */ case ICMPV6_DEST_UNREACH: case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: /* Max length: 3+maxlen */ if (recurse) { - printk("["); + ve_printk(VE_LOG, "["); dump_packet(info, skb, ptr + sizeof(_icmp6h), 0); - printk("] "); + ve_printk(VE_LOG, "] "); } /* Max length: 10 "MTU=65535 " */ if 
(ic->icmp6_type == ICMPV6_PKT_TOOBIG) - printk("MTU=%u ", ntohl(ic->icmp6_mtu)); + ve_printk(VE_LOG, "MTU=%u ", ntohl(ic->icmp6_mtu)); } break; } /* Max length: 10 "PROTO=255 " */ default: - printk("PROTO=%u ", currenthdr); + ve_printk(VE_LOG, "PROTO=%u ", currenthdr); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u GID=%u ", + ve_printk(VE_LOG, "UID=%u GID=%u ", skb->sk->sk_socket->file->f_cred->fsuid, skb->sk->sk_socket->file->f_cred->fsgid); read_unlock_bh(&skb->sk->sk_callback_lock); @@ -371,7 +371,7 @@ static void dump_packet(const struct nf_ /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!recurse && skb->mark) - printk("MARK=0x%x ", skb->mark); + ve_printk(VE_LOG, "MARK=0x%x ", skb->mark); } static struct nf_loginfo default_loginfo = { @@ -397,14 +397,14 @@ ip6t_log_packet(u_int8_t pf, loginfo = &default_loginfo; spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); if (in && !out) { unsigned int len; /* MAC logging for input chain only. */ - printk("MAC="); + ve_printk(VE_LOG, "MAC="); if (skb->dev && (len = skb->dev->hard_header_len) && skb->mac_header != skb->network_header) { const unsigned char *p = skb_mac_header(skb); @@ -416,23 +416,23 @@ ip6t_log_packet(u_int8_t pf, if (p != NULL) { for (i = 0; i < len; i++) - printk("%02x%s", p[i], + ve_printk(VE_LOG, "%02x%s", p[i], i == len - 1 ? 
"" : ":"); } - printk(" "); + ve_printk(VE_LOG, " "); if (skb->dev->type == ARPHRD_SIT) { const struct iphdr *iph = (struct iphdr *)skb_mac_header(skb); - printk("TUNNEL=%pI4->%pI4 ", + ve_printk(VE_LOG, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr); } } else - printk(" "); + ve_printk(VE_LOG, " "); } dump_packet(loginfo, skb, skb_network_offset(skb), 1); - printk("\n"); + ve_printk(VE_LOG, "\n"); spin_unlock_bh(&log_lock); } diff -urNp linux-2.6.32.48/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c linux-2.6.32.48-openvz/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c --- linux-2.6.32.48/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c 2011-11-21 17:40:47.000000000 -0500 @@ -210,12 +210,13 @@ static unsigned int ipv6_defrag(unsigned int (*okfn)(struct sk_buff *)) { struct sk_buff *reasm; + struct net *net = out ? dev_net(out) : dev_net(in); /* Previously seen (loopback)? */ if (skb->nfct) return NF_ACCEPT; - reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); + reasm = nf_ct_frag6_gather(net, skb, nf_ct6_defrag_user(hooknum, skb)); /* queued */ if (reasm == NULL) return NF_STOLEN; diff -urNp linux-2.6.32.48/net/ipv6/netfilter/nf_conntrack_reasm.c linux-2.6.32.48-openvz/net/ipv6/netfilter/nf_conntrack_reasm.c --- linux-2.6.32.48/net/ipv6/netfilter/nf_conntrack_reasm.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/netfilter/nf_conntrack_reasm.c 2011-11-21 17:40:47.000000000 -0500 @@ -118,11 +118,12 @@ static void nf_skb_free(struct sk_buff * } /* Memory Tracking Functions. 
*/ -static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) +static inline void frag_kfree_skb(struct netns_frags *nf, + struct sk_buff *skb, unsigned int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &nf_init_frags.mem); + atomic_sub(skb->truesize, &nf->mem); nf_skb_free(skb); kfree_skb(skb); } @@ -142,10 +143,10 @@ static __inline__ void fq_kill(struct nf inet_frag_kill(&fq->q, &nf_frags); } -static void nf_ct_frag6_evictor(void) +static void nf_ct_frag6_evictor(struct netns_frags *nf) { local_bh_disable(); - inet_frag_evictor(&nf_init_frags, &nf_frags); + inet_frag_evictor(nf, &nf_frags); local_bh_enable(); } @@ -171,7 +172,7 @@ out: /* Creation primitives. */ static __inline__ struct nf_ct_frag6_queue * -fq_find(__be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst) +fq_find(struct net *net, __be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -185,7 +186,7 @@ fq_find(__be32 id, u32 user, struct in6_ read_lock_bh(&nf_frags.lock); hash = inet6_hash_frag(id, src, dst, nf_frags.rnd); - q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); + q = inet_frag_find(&net->ipv6.ct_frags, &nf_frags, &arg, hash); local_bh_enable(); if (q == NULL) goto oom; @@ -198,7 +199,8 @@ oom: } -static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, +static int nf_ct_frag6_queue(struct net *net, struct nf_ct_frag6_queue *fq, + struct sk_buff *skb, const struct frag_hdr *fhdr, int nhoff) { struct sk_buff *prev, *next; @@ -339,7 +341,7 @@ static int nf_ct_frag6_queue(struct nf_c fq->q.fragments = next; fq->q.meat -= free_it->len; - frag_kfree_skb(free_it, NULL); + frag_kfree_skb(fq->q.net, free_it, NULL); } } @@ -355,7 +357,7 @@ static int nf_ct_frag6_queue(struct nf_c skb->dev = NULL; fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &nf_init_frags.mem); + atomic_add(skb->truesize, 
&net->ipv6.ct_frags.mem); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -365,7 +367,7 @@ static int nf_ct_frag6_queue(struct nf_c fq->q.last_in |= INET_FRAG_FIRST_IN; } write_lock(&nf_frags.lock); - list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list); + list_move_tail(&fq->q.lru_list, &net->ipv6.ct_frags.lru_list); write_unlock(&nf_frags.lock); return 0; @@ -383,7 +385,8 @@ err: * the last and the first frames arrived and all the bits are here. */ static struct sk_buff * -nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) +nf_ct_frag6_reasm(struct net *net, struct nf_ct_frag6_queue *fq, + struct net_device *dev) { struct sk_buff *fp, *op, *head = fq->q.fragments; int payload_len; @@ -432,7 +435,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_que clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - atomic_add(clone->truesize, &nf_init_frags.mem); + atomic_add(clone->truesize, &net->ipv6.ct_frags.mem); } /* We have to remove fragment header from datagram and to relocate @@ -446,7 +449,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_que skb_shinfo(head)->frag_list = head->next; skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &nf_init_frags.mem); + atomic_sub(head->truesize, &net->ipv6.ct_frags.mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -456,7 +459,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_que else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &nf_init_frags.mem); + atomic_sub(fp->truesize, &net->ipv6.ct_frags.mem); } head->next = NULL; @@ -563,7 +566,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 * return 0; } -struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) +struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { struct sk_buff *clone; struct 
net_device *dev = skb->dev; @@ -600,10 +603,11 @@ struct sk_buff *nf_ct_frag6_gather(struc hdr = ipv6_hdr(clone); fhdr = (struct frag_hdr *)skb_transport_header(clone); - if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh) - nf_ct_frag6_evictor(); + if (atomic_read(&net->ipv6.ct_frags.mem) > + net->ipv6.ct_frags.high_thresh) + nf_ct_frag6_evictor(&net->ipv6.ct_frags); - fq = fq_find(fhdr->identification, user, &hdr->saddr, &hdr->daddr); + fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); goto ret_orig; @@ -611,7 +615,7 @@ struct sk_buff *nf_ct_frag6_gather(struc spin_lock_bh(&fq->q.lock); - if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { + if (nf_ct_frag6_queue(net, fq, clone, fhdr, nhoff) < 0) { spin_unlock_bh(&fq->q.lock); pr_debug("Can't insert skb to queue\n"); fq_put(fq); @@ -620,7 +624,7 @@ struct sk_buff *nf_ct_frag6_gather(struc if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { - ret_skb = nf_ct_frag6_reasm(fq, dev); + ret_skb = nf_ct_frag6_reasm(net, fq, dev); if (ret_skb == NULL) pr_debug("Can't reassemble fragmented packets\n"); } @@ -655,8 +659,32 @@ void nf_ct_frag6_output(unsigned int hoo nf_conntrack_put_reasm(skb); } +static int nf_ct_frag6_init_net(struct net *net) +{ + struct netns_frags *frags = &net->ipv6.ct_frags; + + frags->timeout = IPV6_FRAG_TIMEOUT; + frags->high_thresh = 256 * 1024; + frags->low_thresh = 192 * 1024; + inet_frags_init_net(frags); + + return 0; /* FIXME : sysctls */ +} + +static void nf_ct_frag6_exit_net(struct net *net) +{ + inet_frags_exit_net(&net->ipv6.ct_frags, &nf_frags); +} + +static struct pernet_operations nf_ct_frag6_ops = { + .init = nf_ct_frag6_init_net, + .exit = nf_ct_frag6_exit_net, +}; + int nf_ct_frag6_init(void) { + register_pernet_subsys(&nf_ct_frag6_ops); + nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; @@ 
-665,10 +693,6 @@ int nf_ct_frag6_init(void) nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.secret_interval = 10 * 60 * HZ; - nf_init_frags.timeout = IPV6_FRAG_TIMEOUT; - nf_init_frags.high_thresh = 256 * 1024; - nf_init_frags.low_thresh = 192 * 1024; - inet_frags_init_net(&nf_init_frags); inet_frags_init(&nf_frags); return 0; @@ -677,7 +701,5 @@ int nf_ct_frag6_init(void) void nf_ct_frag6_cleanup(void) { inet_frags_fini(&nf_frags); - - nf_init_frags.low_thresh = 0; - nf_ct_frag6_evictor(); + unregister_pernet_subsys(&nf_ct_frag6_ops); } diff -urNp linux-2.6.32.48/net/ipv6/reassembly.c linux-2.6.32.48-openvz/net/ipv6/reassembly.c --- linux-2.6.32.48/net/ipv6/reassembly.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/reassembly.c 2011-11-21 17:40:47.000000000 -0500 @@ -199,8 +199,10 @@ static void ip6_frag_expire(unsigned lon struct frag_queue *fq; struct net_device *dev = NULL; struct net *net; + struct ve_struct *old_ve; fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + old_ve = set_exec_env(fq->q.owner_ve); spin_lock(&fq->q.lock); @@ -235,6 +237,8 @@ out: dev_put(dev); spin_unlock(&fq->q.lock); fq_put(fq); + + (void)set_exec_env(old_ve); } static __inline__ struct frag_queue * @@ -515,6 +519,7 @@ static int ip6_frag_reasm(struct frag_qu clone->csum = 0; clone->ip_summed = head->ip_summed; atomic_add(clone->truesize, &fq->q.net->mem); + clone->owner_env = head->owner_env; } /* We have to remove fragment header from datagram and to relocate diff -urNp linux-2.6.32.48/net/ipv6/sit.c linux-2.6.32.48-openvz/net/ipv6/sit.c --- linux-2.6.32.48/net/ipv6/sit.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/sit.c 2011-11-21 18:19:20.000000000 -0500 @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,9 @@ #include #include +#include +#include + /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -87,6 +91,9 @@ 
static struct ip_tunnel * ipip6_tunnel_l struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); + if (sitn == NULL) + return NULL; + for (t = sitn->tunnels_r_l[h0^h1]; t; t = t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && @@ -937,11 +944,14 @@ static int ipip6_tunnel_change_mtu(struc return 0; } +static void sit_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context *ctx); static const struct net_device_ops ipip6_netdev_ops = { .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = ipip6_tunnel_xmit, .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_change_mtu = ipip6_tunnel_change_mtu, + .ndo_cpt = sit_cpt, }; static void ipip6_tunnel_setup(struct net_device *dev) @@ -1011,11 +1021,116 @@ static void sit_destroy_tunnels(struct s } } +static void sit_cpt(struct net_device *dev, + struct cpt_ops *ops, struct cpt_context *ctx) +{ + struct cpt_tunnel_image v; + struct ip_tunnel *t; + struct sit_net *sitn; + + t = netdev_priv(dev); + sitn = net_generic(get_exec_env()->ve_netns, sit_net_id); + BUG_ON(sitn == NULL); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + /* mark fb dev */ + v.cpt_tnl_flags = CPT_TUNNEL_SIT; + if (dev == sitn->fb_tunnel_dev) + v.cpt_tnl_flags |= CPT_TUNNEL_FBDEV; + + v.cpt_i_flags = t->parms.i_flags; + v.cpt_o_flags = t->parms.o_flags; + v.cpt_i_key = t->parms.i_key; + v.cpt_o_key = t->parms.o_key; + + BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph)); + memcpy(&v.cpt_iphdr, &t->parms.iph, sizeof(t->parms.iph)); + + ops->write(&v, sizeof(v), ctx); +} + +static int sit_rst(loff_t start, struct cpt_netdev_image *di, + struct rst_ops *ops, struct cpt_context *ctx) +{ + int err = -ENODEV; + struct cpt_tunnel_image v; + struct net_device *dev; + struct ip_tunnel *t; + loff_t pos; + int fbdev; + struct sit_net *sitn; + + sitn = net_generic(get_exec_env()->ve_netns, sit_net_id); + if (sitn == NULL) + 
return -EOPNOTSUPP; + + pos = start + di->cpt_hdrlen; + err = ops->get_object(CPT_OBJ_NET_IPIP_TUNNEL, + pos, &v, sizeof(v), ctx); + if (err) + return err; + + /* some sanity */ + if (v.cpt_content != CPT_CONTENT_VOID) + return -EINVAL; + + if (!(v.cpt_tnl_flags & CPT_TUNNEL_SIT)) + return 1; + + if (v.cpt_tnl_flags & CPT_TUNNEL_FBDEV) { + fbdev = 1; + err = 0; + dev = sitn->fb_tunnel_dev; + } else { + fbdev = 0; + err = -ENOMEM; + dev = alloc_netdev(sizeof(struct ip_tunnel), di->cpt_name, + ipip6_tunnel_setup); + if (!dev) + goto out; + } + + t = netdev_priv(dev); + t->parms.i_flags = v.cpt_i_flags; + t->parms.o_flags = v.cpt_o_flags; + t->parms.i_key = v.cpt_i_key; + t->parms.o_key = v.cpt_o_key; + + BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph)); + memcpy(&t->parms.iph, &v.cpt_iphdr, sizeof(t->parms.iph)); + + if (!fbdev) { + ipip6_tunnel_init(dev); + err = register_netdevice(dev); + if (err) { + free_netdev(dev); + goto out; + } + + dev_hold(dev); + ipip6_tunnel_link(sitn, t); + } +out: + return err; +} + +static struct netdev_rst sit_netdev_rst = { + .cpt_object = CPT_OBJ_NET_IPIP_TUNNEL, + .ndo_rst = sit_rst, +}; + static int sit_init_net(struct net *net) { int err; struct sit_net *sitn; + if (!(get_exec_env()->features & VE_FEATURE_SIT)) + return 0; + err = -ENOMEM; sitn = kzalloc(sizeof(struct sit_net), GFP_KERNEL); if (sitn == NULL) @@ -1061,6 +1176,9 @@ static void sit_exit_net(struct net *net struct sit_net *sitn; sitn = net_generic(net, sit_net_id); + if (sitn == NULL) /* no VE_FEATURE_SIT */ + return; + rtnl_lock(); sit_destroy_tunnels(sitn); unregister_netdevice(sitn->fb_tunnel_dev); @@ -1075,6 +1193,7 @@ static struct pernet_operations sit_net_ static void __exit sit_cleanup(void) { + unregister_netdev_rst(&sit_netdev_rst); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); unregister_pernet_gen_device(sit_net_id, &sit_net_ops); @@ -1091,9 +1210,16 @@ static int __init sit_init(void) return err; err = xfrm4_tunnel_register(&sit_handler, 
AF_INET6); if (err < 0) { - unregister_pernet_device(&sit_net_ops); + unregister_pernet_gen_device(sit_net_id, &sit_net_ops); printk(KERN_INFO "sit init: Can't add protocol\n"); + return err; } + err = register_netdev_rst(&sit_netdev_rst); + if (err < 0) { + xfrm4_tunnel_deregister(&sit_handler, AF_INET6); + unregister_pernet_gen_device(sit_net_id, &sit_net_ops); + } + return err; } diff -urNp linux-2.6.32.48/net/ipv6/tcp_ipv6.c linux-2.6.32.48-openvz/net/ipv6/tcp_ipv6.c --- linux-2.6.32.48/net/ipv6/tcp_ipv6.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/ipv6/tcp_ipv6.c 2011-11-21 17:40:47.000000000 -0500 @@ -62,6 +62,8 @@ #include #include +#include + #include #include @@ -76,7 +78,7 @@ static void tcp_v6_reqsk_send_ack(struct static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static const struct inet_connection_sock_af_ops ipv6_mapped; +const struct inet_connection_sock_af_ops ipv6_mapped; static const struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; @@ -893,6 +895,7 @@ struct request_sock_ops tcp6_request_soc .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset }; +EXPORT_SYMBOL(tcp6_request_sock_ops); #ifdef CONFIG_TCP_MD5SIG static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { @@ -1497,6 +1500,7 @@ static int tcp_v6_do_rcv(struct sock *sk struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp; struct sk_buff *opt_skb = NULL; + struct user_beancounter *ub; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. 
@@ -1509,6 +1513,8 @@ static int tcp_v6_do_rcv(struct sock *sk if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + ub = set_exec_ub(sock_bc(sk)->ub); + #ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash (sk, skb)) goto discard; @@ -1545,7 +1551,7 @@ static int tcp_v6_do_rcv(struct sock *sk TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; - return 0; + goto restore_context; } if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) @@ -1566,7 +1572,7 @@ static int tcp_v6_do_rcv(struct sock *sk goto reset; if (opt_skb) __kfree_skb(opt_skb); - return 0; + goto restore_context; } } @@ -1576,6 +1582,9 @@ static int tcp_v6_do_rcv(struct sock *sk TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; + +restore_context: + (void)set_exec_ub(ub); return 0; reset: @@ -1584,7 +1593,7 @@ discard: if (opt_skb) __kfree_skb(opt_skb); kfree_skb(skb); - return 0; + goto restore_context; csum_err: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1615,7 +1624,7 @@ ipv6_pktoptions: } kfree_skb(opt_skb); - return 0; + goto restore_context; } static int tcp_v6_rcv(struct sk_buff *skb) @@ -1794,7 +1803,7 @@ static const struct tcp_sock_af_ops tcp_ * TCP over IPv4 via INET6 API */ -static const struct inet_connection_sock_af_ops ipv6_mapped = { +const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1813,6 +1822,8 @@ static const struct inet_connection_sock #endif }; +EXPORT_SYMBOL_GPL(ipv6_mapped); + #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .md5_lookup = tcp_v4_md5_lookup, diff -urNp linux-2.6.32.48/net/key/af_key.c linux-2.6.32.48-openvz/net/key/af_key.c --- linux-2.6.32.48/net/key/af_key.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/key/af_key.c 2011-11-21 17:40:47.000000000 -0500 @@ -183,7 +183,7 @@ static int pfkey_create(struct net *net, struct sock 
*sk; int err; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; diff -urNp linux-2.6.32.48/net/netfilter/core.c linux-2.6.32.48-openvz/net/netfilter/core.c --- linux-2.6.32.48/net/netfilter/core.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/core.c 2011-11-21 17:40:47.000000000 -0500 @@ -60,6 +60,8 @@ int nf_register_hook(struct nf_hook_ops struct nf_hook_ops *elem; int err; + BUG_ON(!ve_is_super(get_exec_env())); + err = mutex_lock_interruptible(&nf_hook_mutex); if (err < 0) return err; @@ -75,6 +77,8 @@ EXPORT_SYMBOL(nf_register_hook); void nf_unregister_hook(struct nf_hook_ops *reg) { + BUG_ON(!ve_is_super(get_exec_env())); + mutex_lock(&nf_hook_mutex); list_del_rcu(&reg->list); mutex_unlock(&nf_hook_mutex); diff -urNp linux-2.6.32.48/net/netfilter/ipvs/ip_vs_conn.c linux-2.6.32.48-openvz/net/netfilter/ipvs/ip_vs_conn.c --- linux-2.6.32.48/net/netfilter/ipvs/ip_vs_conn.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/ipvs/ip_vs_conn.c 2011-11-21 17:40:47.000000000 -0500 @@ -1074,7 +1074,7 @@ int __init ip_vs_conn_init(void) /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (!ip_vs_conn_cachep) { vfree(ip_vs_conn_tab); return -ENOMEM; diff -urNp linux-2.6.32.48/net/netfilter/ipvs/ip_vs_sync.c linux-2.6.32.48-openvz/net/netfilter/ipvs/ip_vs_sync.c --- linux-2.6.32.48/net/netfilter/ipvs/ip_vs_sync.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/ipvs/ip_vs_sync.c 2011-11-21 17:40:47.000000000 -0500 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -490,7 +491,8 @@ static int set_mcast_if(struct sock *sk, struct net_device *dev; struct inet_sock *inet = inet_sk(sk); - if ((dev = 
__dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) @@ -511,11 +513,12 @@ static int set_mcast_if(struct sock *sk, */ static int set_sync_mesg_maxlen(int sync_state) { + struct net *net = get_exec_env()->ve_netns; struct net_device *dev; int num; if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(net, ip_vs_master_mcast_ifn)) == NULL) return -ENODEV; num = (dev->mtu - sizeof(struct iphdr) - @@ -526,7 +529,7 @@ static int set_sync_mesg_maxlen(int sync IP_VS_DBG(7, "setting the maximum length of sync sending " "message %d.\n", sync_send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(net, ip_vs_backup_mcast_ifn)) == NULL) return -ENODEV; sync_recv_mesg_maxlen = dev->mtu - @@ -554,7 +557,8 @@ join_mcast_group(struct sock *sk, struct memset(&mreq, 0, sizeof(mreq)); memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -575,7 +579,8 @@ static int bind_mcastif_addr(struct sock __be32 addr; struct sockaddr_in sin; - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname); + if (!dev) return -ENODEV; addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); diff -urNp linux-2.6.32.48/net/netfilter/nf_conntrack_core.c linux-2.6.32.48-openvz/net/netfilter/nf_conntrack_core.c --- linux-2.6.32.48/net/netfilter/nf_conntrack_core.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/nf_conntrack_core.c 
2011-11-21 17:40:47.000000000 -0500 @@ -45,6 +45,9 @@ #include #include +#include +#include + #define NF_CONNTRACK_VERSION "0.5.0" int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, @@ -179,6 +182,11 @@ destroy_conntrack(struct nf_conntrack *n struct nf_conn *ct = (struct nf_conn *)nfct; struct net *net = nf_ct_net(ct); struct nf_conntrack_l4proto *l4proto; +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *old_ve; + + old_ve = set_exec_env(ct->ct_net->owner_ve); +#endif pr_debug("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); @@ -215,6 +223,9 @@ destroy_conntrack(struct nf_conntrack *n pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); nf_conntrack_free(ct); +#ifdef CONFIG_VE_IPTABLES + (void)set_exec_env(old_ve); +#endif } void nf_ct_delete_from_lists(struct nf_conn *ct) @@ -538,9 +549,11 @@ static noinline int early_drop(struct ne struct nf_conn *nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, + struct user_beancounter *ub, gfp_t gfp) { struct nf_conn *ct; + struct user_beancounter *old_ub; if (unlikely(!nf_conntrack_hash_rnd_initted)) { get_random_bytes(&nf_conntrack_hash_rnd, @@ -568,7 +581,9 @@ struct nf_conn *nf_conntrack_alloc(struc * Do not use kmem_cache_zalloc(), as this cache uses * SLAB_DESTROY_BY_RCU. 
*/ + old_ub = set_exec_ub(ub); ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); + (void)set_exec_ub(old_ub); if (ct == NULL) { pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); atomic_dec(&net->ct.count); @@ -625,13 +640,20 @@ init_conntrack(struct net *net, struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; struct nf_conntrack_expect *exp; + struct user_beancounter *ub = NULL; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { pr_debug("Can't invert tuple.\n"); return NULL; } - ct = nf_conntrack_alloc(net, tuple, &repl_tuple, GFP_ATOMIC); +#ifdef CONFIG_BEANCOUNTERS + if (skb->dev != NULL) /* received skb */ + ub = netdev_bc(skb->dev)->exec_ub; + else if (skb->sk != NULL) /* sent skb */ + ub = sock_bc(skb->sk)->ub; +#endif + ct = nf_conntrack_alloc(net, tuple, &repl_tuple, ub, GFP_ATOMIC); if (IS_ERR(ct)) { pr_debug("Can't allocate conntrack.\n"); return (struct nf_conntrack_tuple_hash *)ct; @@ -714,6 +736,8 @@ resolve_normal_ct(struct net *net, /* look for tuple match */ h = nf_conntrack_find_get(net, &tuple); if (!h) { + if (!mask_ipt_allow(get_exec_env()->ipt_mask, VE_NF_CONNTRACK)) + return NULL; h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff); if (!h) return NULL; @@ -1168,12 +1192,12 @@ void *nf_ct_alloc_hashtable(unsigned int BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); sz = nr_slots * sizeof(struct hlist_nulls_head); - hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, + hash = (void *)__get_free_pages(GFP_KERNEL_UBC | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); if (!hash) { *vmalloced = 1; printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); - hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + hash = __vmalloc(sz, GFP_KERNEL_UBC | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); } diff -urNp linux-2.6.32.48/net/netfilter/nf_conntrack_expect.c 
linux-2.6.32.48-openvz/net/netfilter/nf_conntrack_expect.c --- linux-2.6.32.48/net/netfilter/nf_conntrack_expect.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/nf_conntrack_expect.c 2011-11-21 17:40:47.000000000 -0500 @@ -305,7 +305,7 @@ void nf_ct_expect_put(struct nf_conntrac } EXPORT_SYMBOL_GPL(nf_ct_expect_put); -static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) +void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); @@ -329,6 +329,7 @@ static void nf_ct_expect_insert(struct n atomic_inc(&exp->use); NF_CT_STAT_INC(net, expect_create); } +EXPORT_SYMBOL_GPL(nf_ct_expect_insert); /* Race with expectations being used means we could have none to find; OK. */ static void evict_oldest_expect(struct nf_conn *master, diff -urNp linux-2.6.32.48/net/netfilter/nf_conntrack_netlink.c linux-2.6.32.48-openvz/net/netfilter/nf_conntrack_netlink.c --- linux-2.6.32.48/net/netfilter/nf_conntrack_netlink.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/nf_conntrack_netlink.c 2011-11-21 17:40:47.000000000 -0500 @@ -46,6 +46,10 @@ #include #include +#include +#include +#include + MODULE_LICENSE("GPL"); static char __initdata version[] = "0.93"; @@ -1178,13 +1182,14 @@ static struct nf_conn * ctnetlink_create_conntrack(const struct nlattr * const cda[], struct nf_conntrack_tuple *otuple, struct nf_conntrack_tuple *rtuple, - u8 u3) + u8 u3, + struct user_beancounter *ub) { struct nf_conn *ct; int err = -EINVAL; struct nf_conntrack_helper *helper; - ct = nf_conntrack_alloc(&init_net, otuple, rtuple, GFP_ATOMIC); + ct = nf_conntrack_alloc(&init_net, otuple, rtuple, ub, GFP_ATOMIC); if (IS_ERR(ct)) return ERR_PTR(-ENOMEM); @@ -1342,9 +1347,14 @@ ctnetlink_new_conntrack(struct sock *ctn if (nlh->nlmsg_flags & NLM_F_CREATE) { struct nf_conn *ct; enum ip_conntrack_events events; + struct user_beancounter *ub = NULL; 
+#ifdef CONFIG_BEANCOUNTERS + if (skb->sk) + ub = sock_bc(skb->sk)->ub; +#endif ct = ctnetlink_create_conntrack(cda, &otuple, - &rtuple, u3); + &rtuple, u3, ub); if (IS_ERR(ct)) { err = PTR_ERR(ct); goto out_unlock; diff -urNp linux-2.6.32.48/net/netfilter/nf_conntrack_standalone.c linux-2.6.32.48-openvz/net/netfilter/nf_conntrack_standalone.c --- linux-2.6.32.48/net/netfilter/nf_conntrack_standalone.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/nf_conntrack_standalone.c 2011-11-21 17:40:47.000000000 -0500 @@ -29,6 +29,10 @@ MODULE_LICENSE("GPL"); +int ip_conntrack_disable_ve0 = 0; +module_param(ip_conntrack_disable_ve0, int, 0440); +EXPORT_SYMBOL(ip_conntrack_disable_ve0); + #ifdef CONFIG_PROC_FS int print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, diff -urNp linux-2.6.32.48/net/netfilter/nfnetlink.c linux-2.6.32.48-openvz/net/netfilter/nfnetlink.c --- linux-2.6.32.48/net/netfilter/nfnetlink.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/nfnetlink.c 2011-11-21 17:40:47.000000000 -0500 @@ -133,7 +133,7 @@ static int nfnetlink_rcv_msg(struct sk_b const struct nfnetlink_subsystem *ss; int type, err; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; /* All the messages must at least contain nfgenmsg */ diff -urNp linux-2.6.32.48/net/netfilter/nfnetlink_queue.c linux-2.6.32.48-openvz/net/netfilter/nfnetlink_queue.c --- linux-2.6.32.48/net/netfilter/nfnetlink_queue.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/nfnetlink_queue.c 2011-11-21 17:40:47.000000000 -0500 @@ -555,9 +555,6 @@ nfqnl_rcv_dev_event(struct notifier_bloc { struct net_device *dev = ptr; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) nfqnl_dev_drop(dev->ifindex); @@ -586,8 +583,7 @@ nfqnl_rcv_nl_event(struct 
notifier_block struct hlist_head *head = &instance_table[i]; hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - if ((n->net == &init_net) && - (n->pid == inst->peer_pid)) + if (n->pid == inst->peer_pid) __instance_destroy(inst); } } diff -urNp linux-2.6.32.48/net/netfilter/x_tables.c linux-2.6.32.48-openvz/net/netfilter/x_tables.c --- linux-2.6.32.48/net/netfilter/x_tables.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/x_tables.c 2011-11-21 17:40:47.000000000 -0500 @@ -24,6 +24,8 @@ #include #include +#include + #include #include @@ -66,6 +68,46 @@ static const char *const xt_prefix[NFPRO [NFPROTO_IPV6] = "ip6", }; +#ifdef CONFIG_BEANCOUNTERS +static inline struct user_beancounter *xt_table_ub(struct xt_table_info *info) +{ + struct user_beancounter *ub; + + for (ub = mem_ub(info); ub->parent != NULL; ub = ub->parent); + return ub; +} + +static void uncharge_xtables(struct xt_table_info *info, unsigned long size) +{ + struct user_beancounter *ub; + + ub = xt_table_ub(info); + uncharge_beancounter(ub, UB_NUMXTENT, size); +} + +static int recharge_xtables(int check_ub, + struct xt_table_info *new, struct xt_table_info *old) +{ + struct user_beancounter *ub; + long change; + + ub = xt_table_ub(new); + BUG_ON(check_ub && ub != xt_table_ub(old)); + + change = (long)new->number - (long)old->number; + if (change > 0) { + if (charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT)) + return -ENOMEM; + } else if (change < 0) + uncharge_beancounter(ub, UB_NUMXTENT, -change); + + return 0; +} +#else +#define recharge_xtables(c, new, old) (0) +#define uncharge_xtables(info, s) do { } while (0) +#endif /* CONFIG_BEANCOUNTERS */ + /* Registration hooks for targets. */ int xt_register_target(struct xt_target *target) @@ -364,14 +406,14 @@ int xt_check_match(struct xt_mtchk_param * ebt_among is exempt from centralized matchsize checking * because it uses a dynamic-size data set. 
*/ - pr_err("%s_tables: %s match: invalid size %Zu != %u\n", + ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: invalid size %Zu != %u\n", xt_prefix[par->family], par->match->name, XT_ALIGN(par->match->matchsize), size); return -EINVAL; } if (par->match->table != NULL && strcmp(par->match->table, par->table) != 0) { - pr_err("%s_tables: %s match: only valid in %s table, not %s\n", + ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: only valid in %s table, not %s\n", xt_prefix[par->family], par->match->name, par->match->table, par->table); return -EINVAL; @@ -379,7 +421,7 @@ int xt_check_match(struct xt_mtchk_param if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { char used[64], allow[64]; - pr_err("%s_tables: %s match: used from hooks %s, but only " + ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: used from hooks %s, but only " "valid from %s\n", xt_prefix[par->family], par->match->name, textify_hooks(used, sizeof(used), par->hook_mask), @@ -387,7 +429,7 @@ int xt_check_match(struct xt_mtchk_param return -EINVAL; } if (par->match->proto && (par->match->proto != proto || inv_proto)) { - pr_err("%s_tables: %s match: only valid for protocol %u\n", + ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: only valid for protocol %u\n", xt_prefix[par->family], par->match->name, par->match->proto); return -EINVAL; @@ -620,19 +662,19 @@ struct xt_table_info *xt_alloc_table_inf if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); + newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL_UBC); if (!newinfo) return NULL; - newinfo->size = size; + newinfo->alloc_size = newinfo->size = size; for_each_possible_cpu(cpu) { if (size <= PAGE_SIZE) newinfo->entries[cpu] = kmalloc_node(size, - GFP_KERNEL, + GFP_KERNEL_UBC, cpu_to_node(cpu)); else - newinfo->entries[cpu] = vmalloc_node(size, + newinfo->entries[cpu] = ub_vmalloc_node(size, cpu_to_node(cpu)); if (newinfo->entries[cpu] == NULL) { @@ -650,7 +692,7 
@@ void xt_free_table_info(struct xt_table_ int cpu; for_each_possible_cpu(cpu) { - if (info->size <= PAGE_SIZE) + if (info->alloc_size <= PAGE_SIZE) kfree(info->entries[cpu]); else vfree(info->entries[cpu]); @@ -721,6 +763,12 @@ xt_replace_table(struct xt_table *table, return NULL; } + if (recharge_xtables(num_counters != 0, newinfo, private)) { + local_bh_enable(); + *error = -ENOMEM; + return NULL; + } + table->private = newinfo; newinfo->initial_entries = private->initial_entries; @@ -798,6 +846,7 @@ void *xt_unregister_table(struct xt_tabl list_del(&table->list); mutex_unlock(&xt[table->af].mutex); kfree(table); + uncharge_xtables(private, private->number); return private; } diff -urNp linux-2.6.32.48/net/netfilter/xt_connmark.c linux-2.6.32.48-openvz/net/netfilter/xt_connmark.c --- linux-2.6.32.48/net/netfilter/xt_connmark.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_connmark.c 2011-11-21 17:40:47.000000000 -0500 @@ -47,6 +47,36 @@ connmark_mt(const struct sk_buff *skb, c return ((ct->mark & info->mask) == info->mark) ^ info->invert; } +static bool +connmark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct xt_connmark_info *info = par->matchinfo; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + + return ((ct->mark & info->mask) == info->mark) ^ info->invert; +} + +static bool connmark_mt_check_v0(const struct xt_mtchk_param *par) +{ + const struct xt_connmark_info *cm = par->matchinfo; + + if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { + printk(KERN_WARNING "connmark: only support 32bit mark\n"); + return false; + } + if (nf_ct_l3proto_try_module_get(par->family) < 0) { + printk(KERN_WARNING "can't load conntrack support for " + "proto=%u\n", par->family); + return false; + } + return true; +} + static bool connmark_mt_check(const struct xt_mtchk_param *par) { if (nf_ct_l3proto_try_module_get(par->family) < 
0) { @@ -62,25 +92,74 @@ static void connmark_mt_destroy(const st nf_ct_l3proto_module_put(par->family); } -static struct xt_match connmark_mt_reg __read_mostly = { - .name = "connmark", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = connmark_mt_check, - .match = connmark_mt, - .matchsize = sizeof(struct xt_connmark_mtinfo1), - .destroy = connmark_mt_destroy, - .me = THIS_MODULE, +#ifdef CONFIG_COMPAT +struct compat_xt_connmark_info { + compat_ulong_t mark, mask; + u_int8_t invert; + u_int8_t __pad1; + u_int16_t __pad2; +}; + +static void connmark_mt_compat_from_user_v0(void *dst, void *src) +{ + const struct compat_xt_connmark_info *cm = src; + struct xt_connmark_info m = { + .mark = cm->mark, + .mask = cm->mask, + .invert = cm->invert, + }; + memcpy(dst, &m, sizeof(m)); +} + +static int connmark_mt_compat_to_user_v0(void __user *dst, void *src) +{ + const struct xt_connmark_info *m = src; + struct compat_xt_connmark_info cm = { + .mark = m->mark, + .mask = m->mask, + .invert = m->invert, + }; + return copy_to_user(dst, &cm, sizeof(cm)) ? 
-EFAULT : 0; +} +#endif /* CONFIG_COMPAT */ + +static struct xt_match connmark_mt_reg[] __read_mostly = { + { + .name = "connmark", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_mt_check_v0, + .match = connmark_mt_v0, + .destroy = connmark_mt_destroy, + .matchsize = sizeof(struct xt_connmark_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_connmark_info), + .compat_from_user = connmark_mt_compat_from_user_v0, + .compat_to_user = connmark_mt_compat_to_user_v0, +#endif + .me = THIS_MODULE + }, + { + .name = "connmark", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_mt_check, + .match = connmark_mt, + .matchsize = sizeof(struct xt_connmark_mtinfo1), + .destroy = connmark_mt_destroy, + .me = THIS_MODULE, + }, }; static int __init connmark_mt_init(void) { - return xt_register_match(&connmark_mt_reg); + return xt_register_matches(connmark_mt_reg, + ARRAY_SIZE(connmark_mt_reg)); } static void __exit connmark_mt_exit(void) { - xt_unregister_match(&connmark_mt_reg); + xt_unregister_matches(connmark_mt_reg, ARRAY_SIZE(connmark_mt_reg)); } module_init(connmark_mt_init); diff -urNp linux-2.6.32.48/net/netfilter/xt_CONNMARK.c linux-2.6.32.48-openvz/net/netfilter/xt_CONNMARK.c --- linux-2.6.32.48/net/netfilter/xt_CONNMARK.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_CONNMARK.c 2011-11-21 17:40:47.000000000 -0500 @@ -36,6 +36,45 @@ MODULE_ALIAS("ip6t_CONNMARK"); #include static unsigned int +connmark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) +{ + const struct xt_connmark_target_info *markinfo = par->targinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + u_int32_t diff; + u_int32_t mark; + u_int32_t newmark; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + switch(markinfo->mode) { + case XT_CONNMARK_SET: + newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; + if (newmark != ct->mark) { + ct->mark = newmark; + 
nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; + case XT_CONNMARK_SAVE: + newmark = (ct->mark & ~markinfo->mask) | + (skb->mark & markinfo->mask); + if (ct->mark != newmark) { + ct->mark = newmark; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; + case XT_CONNMARK_RESTORE: + mark = skb->mark; + diff = (ct->mark ^ mark) & markinfo->mask; + skb->mark = mark ^ diff; + break; + } + } + + return XT_CONTINUE; +} + +static unsigned int connmark_tg(struct sk_buff *skb, const struct xt_target_param *par) { const struct xt_connmark_tginfo1 *info = par->targinfo; @@ -73,6 +112,30 @@ connmark_tg(struct sk_buff *skb, const s return XT_CONTINUE; } +static bool connmark_tg_check_v0(const struct xt_tgchk_param *par) +{ + const struct xt_connmark_target_info *matchinfo = par->targinfo; + + if (matchinfo->mode == XT_CONNMARK_RESTORE) { + if (strcmp(par->table, "mangle") != 0) { + printk(KERN_WARNING "CONNMARK: restore can only be " + "called from \"mangle\" table, not \"%s\"\n", + par->table); + return false; + } + } + if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) { + printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); + return false; + } + if (nf_ct_l3proto_try_module_get(par->family) < 0) { + printk(KERN_WARNING "can't load conntrack support for " + "proto=%u\n", par->family); + return false; + } + return true; +} + static bool connmark_tg_check(const struct xt_tgchk_param *par) { if (nf_ct_l3proto_try_module_get(par->family) < 0) { @@ -88,25 +151,74 @@ static void connmark_tg_destroy(const st nf_ct_l3proto_module_put(par->family); } -static struct xt_target connmark_tg_reg __read_mostly = { - .name = "CONNMARK", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = connmark_tg_check, - .target = connmark_tg, - .targetsize = sizeof(struct xt_connmark_tginfo1), - .destroy = connmark_tg_destroy, - .me = THIS_MODULE, +#ifdef CONFIG_COMPAT +struct compat_xt_connmark_target_info { + compat_ulong_t mark, mask; + u_int8_t mode; + 
u_int8_t __pad1; + u_int16_t __pad2; +}; + +static void connmark_tg_compat_from_user_v0(void *dst, void *src) +{ + const struct compat_xt_connmark_target_info *cm = src; + struct xt_connmark_target_info m = { + .mark = cm->mark, + .mask = cm->mask, + .mode = cm->mode, + }; + memcpy(dst, &m, sizeof(m)); +} + +static int connmark_tg_compat_to_user_v0(void __user *dst, void *src) +{ + const struct xt_connmark_target_info *m = src; + struct compat_xt_connmark_target_info cm = { + .mark = m->mark, + .mask = m->mask, + .mode = m->mode, + }; + return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; +} +#endif /* CONFIG_COMPAT */ + +static struct xt_target connmark_tg_reg[] __read_mostly = { + { + .name = "CONNMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check_v0, + .destroy = connmark_tg_destroy, + .target = connmark_tg_v0, + .targetsize = sizeof(struct xt_connmark_target_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_connmark_target_info), + .compat_from_user = connmark_tg_compat_from_user_v0, + .compat_to_user = connmark_tg_compat_to_user_v0, +#endif + .me = THIS_MODULE + }, + { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, }; static int __init connmark_tg_init(void) { - return xt_register_target(&connmark_tg_reg); + return xt_register_targets(connmark_tg_reg, + ARRAY_SIZE(connmark_tg_reg)); } static void __exit connmark_tg_exit(void) { - xt_unregister_target(&connmark_tg_reg); + xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); } module_init(connmark_tg_init); diff -urNp linux-2.6.32.48/net/netfilter/xt_conntrack.c linux-2.6.32.48-openvz/net/netfilter/xt_conntrack.c --- linux-2.6.32.48/net/netfilter/xt_conntrack.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_conntrack.c 
2011-11-21 17:40:47.000000000 -0500 @@ -25,6 +25,95 @@ MODULE_ALIAS("ipt_conntrack"); MODULE_ALIAS("ip6t_conntrack"); static bool +conntrack_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct xt_conntrack_info *sinfo = par->matchinfo; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + ct = nf_ct_get(skb, &ctinfo); + +#define FWINV(bool, invflg) ((bool) ^ !!(sinfo->invflags & (invflg))) + + if (ct == &nf_conntrack_untracked) + statebit = XT_CONNTRACK_STATE_UNTRACKED; + else if (ct) + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); + else + statebit = XT_CONNTRACK_STATE_INVALID; + + if (sinfo->flags & XT_CONNTRACK_STATE) { + if (ct) { + if (test_bit(IPS_SRC_NAT_BIT, &ct->status)) + statebit |= XT_CONNTRACK_STATE_SNAT; + if (test_bit(IPS_DST_NAT_BIT, &ct->status)) + statebit |= XT_CONNTRACK_STATE_DNAT; + } + if (FWINV((statebit & sinfo->statemask) == 0, + XT_CONNTRACK_STATE)) + return false; + } + + if (ct == NULL) { + if (sinfo->flags & ~XT_CONNTRACK_STATE) + return false; + return true; + } + + if (sinfo->flags & XT_CONNTRACK_PROTO && + FWINV(nf_ct_protonum(ct) != + sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, + XT_CONNTRACK_PROTO)) + return false; + + if (sinfo->flags & XT_CONNTRACK_ORIGSRC && + FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip & + sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != + sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, + XT_CONNTRACK_ORIGSRC)) + return false; + + if (sinfo->flags & XT_CONNTRACK_ORIGDST && + FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip & + sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != + sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, + XT_CONNTRACK_ORIGDST)) + return false; + + if (sinfo->flags & XT_CONNTRACK_REPLSRC && + FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip & + sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != + sinfo->tuple[IP_CT_DIR_REPLY].src.ip, + XT_CONNTRACK_REPLSRC)) + return false; + + if (sinfo->flags & XT_CONNTRACK_REPLDST && + 
FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip & + sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != + sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, + XT_CONNTRACK_REPLDST)) + return false; + + if (sinfo->flags & XT_CONNTRACK_STATUS && + FWINV((ct->status & sinfo->statusmask) == 0, + XT_CONNTRACK_STATUS)) + return false; + + if(sinfo->flags & XT_CONNTRACK_EXPIRES) { + unsigned long expires = timer_pending(&ct->timeout) ? + (ct->timeout.expires - jiffies)/HZ : 0; + + if (FWINV(!(expires >= sinfo->expires_min && + expires <= sinfo->expires_max), + XT_CONNTRACK_EXPIRES)) + return false; + } + return true; +#undef FWINV +} + +static bool conntrack_addrcmp(const union nf_inet_addr *kaddr, const union nf_inet_addr *uaddr, const union nf_inet_addr *umask, unsigned int l3proto) @@ -112,6 +201,55 @@ ct_proto_port_check(const struct xt_conn return true; } +#ifdef CONFIG_COMPAT +struct compat_xt_conntrack_info +{ + compat_uint_t statemask; + compat_uint_t statusmask; + struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX]; + struct in_addr sipmsk[IP_CT_DIR_MAX]; + struct in_addr dipmsk[IP_CT_DIR_MAX]; + compat_ulong_t expires_min; + compat_ulong_t expires_max; + u_int8_t flags; + u_int8_t invflags; +}; + +static void conntrack_mt_compat_from_user_v0(void *dst, void *src) +{ + const struct compat_xt_conntrack_info *cm = src; + struct xt_conntrack_info m = { + .statemask = cm->statemask, + .statusmask = cm->statusmask, + .expires_min = cm->expires_min, + .expires_max = cm->expires_max, + .flags = cm->flags, + .invflags = cm->invflags, + }; + memcpy(m.tuple, cm->tuple, sizeof(m.tuple)); + memcpy(m.sipmsk, cm->sipmsk, sizeof(m.sipmsk)); + memcpy(m.dipmsk, cm->dipmsk, sizeof(m.dipmsk)); + memcpy(dst, &m, sizeof(m)); +} + +static int conntrack_mt_compat_to_user_v0(void __user *dst, void *src) +{ + const struct xt_conntrack_info *m = src; + struct compat_xt_conntrack_info cm = { + .statemask = m->statemask, + .statusmask = m->statusmask, + .expires_min = m->expires_min, + .expires_max = 
m->expires_max, + .flags = m->flags, + .invflags = m->invflags, + }; + memcpy(cm.tuple, m->tuple, sizeof(cm.tuple)); + memcpy(cm.sipmsk, m->sipmsk, sizeof(cm.sipmsk)); + memcpy(cm.dipmsk, m->dipmsk, sizeof(cm.dipmsk)); + return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; +} +#endif + static bool conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par, u16 state_mask, u16 status_mask) @@ -224,6 +362,21 @@ static void conntrack_mt_destroy(const s static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = conntrack_mt_v0, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .matchsize = sizeof(struct xt_conntrack_info), + .me = THIS_MODULE, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_conntrack_info), + .compat_from_user = conntrack_mt_compat_from_user_v0, + .compat_to_user = conntrack_mt_compat_to_user_v0, +#endif + }, + { + .name = "conntrack", .revision = 1, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo1), diff -urNp linux-2.6.32.48/net/netfilter/xt_dscp.c linux-2.6.32.48-openvz/net/netfilter/xt_dscp.c --- linux-2.6.32.48/net/netfilter/xt_dscp.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_dscp.c 2011-11-21 17:40:47.000000000 -0500 @@ -15,6 +15,7 @@ #include #include +#include MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("Xtables: DSCP/TOS field match"); @@ -54,6 +55,14 @@ static bool dscp_mt_check(const struct x return true; } +static bool +tos_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct ipt_tos_info *info = par->matchinfo; + + return (ip_hdr(skb)->tos == info->tos) ^ info->invert; +} + static bool tos_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct xt_tos_match_info *info = par->matchinfo; @@ -85,6 +94,14 @@ static struct xt_match dscp_mt_reg[] __r }, { .name = "tos", + .revision = 0, + 
.family = NFPROTO_IPV4, + .match = tos_mt_v0, + .matchsize = sizeof(struct ipt_tos_info), + .me = THIS_MODULE, + }, + { + .name = "tos", .revision = 1, .family = NFPROTO_IPV4, .match = tos_mt, diff -urNp linux-2.6.32.48/net/netfilter/xt_DSCP.c linux-2.6.32.48-openvz/net/netfilter/xt_DSCP.c --- linux-2.6.32.48/net/netfilter/xt_DSCP.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_DSCP.c 2011-11-21 17:40:47.000000000 -0500 @@ -18,6 +18,7 @@ #include #include +#include MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification"); @@ -65,13 +66,48 @@ static bool dscp_tg_check(const struct x const struct xt_DSCP_info *info = par->targinfo; if (info->dscp > XT_DSCP_MAX) { - printk(KERN_WARNING "DSCP: dscp %x out of range\n", info->dscp); + ve_printk(VE_LOG, KERN_WARNING "DSCP: dscp %x out of range\n", info->dscp); return false; } return true; } static unsigned int +tos_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) +{ + const struct ipt_tos_target_info *info = par->targinfo; + struct iphdr *iph = ip_hdr(skb); + u_int8_t oldtos; + + if ((iph->tos & IPTOS_TOS_MASK) != info->tos) { + if (!skb_make_writable(skb, sizeof(struct iphdr))) + return NF_DROP; + + iph = ip_hdr(skb); + oldtos = iph->tos; + iph->tos = (iph->tos & IPTOS_PREC_MASK) | info->tos; + csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); + } + + return XT_CONTINUE; +} + +static bool tos_tg_check_v0(const struct xt_tgchk_param *par) +{ + const struct ipt_tos_target_info *info = par->targinfo; + const uint8_t tos = info->tos; + + if (tos != IPTOS_LOWDELAY && tos != IPTOS_THROUGHPUT && + tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST && + tos != IPTOS_NORMALSVC) { + printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); + return false; + } + + return true; +} + +static unsigned int tos_tg(struct sk_buff *skb, const struct xt_target_param *par) { const struct xt_tos_target_info *info = par->targinfo; @@ -132,6 +168,16 @@ static 
struct xt_target dscp_tg_reg[] __ }, { .name = "TOS", + .revision = 0, + .family = NFPROTO_IPV4, + .table = "mangle", + .target = tos_tg_v0, + .targetsize = sizeof(struct ipt_tos_target_info), + .checkentry = tos_tg_check_v0, + .me = THIS_MODULE, + }, + { + .name = "TOS", .revision = 1, .family = NFPROTO_IPV4, .table = "mangle", diff -urNp linux-2.6.32.48/net/netfilter/xt_hashlimit.c linux-2.6.32.48-openvz/net/netfilter/xt_hashlimit.c --- linux-2.6.32.48/net/netfilter/xt_hashlimit.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_hashlimit.c 2011-11-21 17:40:47.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -41,8 +42,13 @@ MODULE_ALIAS("ipt_hashlimit"); MODULE_ALIAS("ip6t_hashlimit"); /* need to declare this at the top */ +#ifdef CONFIG_VE_IPTABLES +#define hashlimit_procdir4 (get_exec_env()->_xt_hashlimit->hashlimit_procdir4) +#define hashlimit_procdir6 (get_exec_env()->_xt_hashlimit->hashlimit_procdir6) +#else static struct proc_dir_entry *hashlimit_procdir4; static struct proc_dir_entry *hashlimit_procdir6; +#endif static const struct file_operations dl_file_ops; /* hash table crap */ @@ -99,9 +105,16 @@ struct xt_hashlimit_htable { static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ static DEFINE_MUTEX(hlimit_mutex); /* additional checkentry protection */ +#ifdef CONFIG_VE_IPTABLES +#define hashlimit_htables (get_exec_env()->_xt_hashlimit->hashlimit_htables) +#else static HLIST_HEAD(hashlimit_htables); +#endif static struct kmem_cache *hashlimit_cachep __read_mostly; +static int init_xt_hashlimit(void); +static void fini_xt_hashlimit(void); + static inline bool dst_cmp(const struct dsthash_ent *ent, const struct dsthash_dst *b) { @@ -687,6 +700,9 @@ static bool hashlimit_mt_check_v0(const if (r->name[sizeof(r->name) - 1] != '\0') return false; + if (init_xt_hashlimit()) + return 0; + /* This is the best we've got: We cannot release and re-grab lock, * since 
checkentry() is called before x_tables.c grabs xt_mutex. * We also cannot grab the hashtable spinlock, since htable_create will @@ -728,6 +744,9 @@ static bool hashlimit_mt_check(const str return false; } + if (init_xt_hashlimit()) + return 0; + /* This is the best we've got: We cannot release and re-grab lock, * since checkentry() is called before x_tables.c grabs xt_mutex. * We also cannot grab the hashtable spinlock, since htable_create will @@ -750,6 +769,8 @@ hashlimit_mt_destroy_v0(const struct xt_ const struct xt_hashlimit_info *r = par->matchinfo; htable_put(r->hinfo); + if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables)) + fini_xt_hashlimit(); } static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) @@ -757,6 +778,8 @@ static void hashlimit_mt_destroy(const s const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; htable_put(info->hinfo); + if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables)) + fini_xt_hashlimit(); } #ifdef CONFIG_COMPAT @@ -957,6 +980,78 @@ static const struct file_operations dl_f .release = seq_release }; +static inline struct proc_dir_entry *proc_from_netns(void) +{ +#if defined(CONFIG_VE) + return get_exec_env()->ve_netns->proc_net; +#else + return init_net.proc_net; +#endif +} + +static int init_xt_hashlimit(void) +{ + struct proc_dir_entry *proc_net = proc_from_netns(); + +#if defined(CONFIG_VE_IPTABLES) + struct ve_struct *ve = get_exec_env(); + + if (ve->_xt_hashlimit) + return 0; + + ve->_xt_hashlimit = kzalloc(sizeof(struct ve_xt_hashlimit), GFP_KERNEL); + if (!ve->_xt_hashlimit) + goto err1; +#endif + INIT_HLIST_HEAD(&hashlimit_htables); + + hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", proc_net); + if (!hashlimit_procdir4) { + printk(KERN_ERR "xt_hashlimit: unable to create proc dir " + "entry\n"); + goto err2; + } +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", proc_net); + if 
(!hashlimit_procdir6) { + printk(KERN_ERR "xt_hashlimit: unable to create proc dir " + "entry\n"); + goto err3; + } +#endif + + return 0; + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +err3: + remove_proc_entry("ipt_hashlimit", proc_net); +#endif +err2: +#if defined(CONFIG_VE_IPTABLES) + kfree(ve->_xt_hashlimit); + ve->_xt_hashlimit = NULL; +err1: +#endif + return -ENOMEM; +} + +static void fini_xt_hashlimit(void) +{ + struct proc_dir_entry *proc_net = proc_from_netns(); +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *ve = get_exec_env(); +#endif +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + remove_proc_entry("ip6t_hashlimit", proc_net); +#endif + remove_proc_entry("ipt_hashlimit", proc_net); + +#if defined(CONFIG_VE_IPTABLES) + kfree(ve->_xt_hashlimit); + ve->_xt_hashlimit = NULL; +#endif +} + static int __init hashlimit_mt_init(void) { int err; @@ -974,24 +1069,11 @@ static int __init hashlimit_mt_init(void printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n"); goto err2; } - hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", init_net.proc_net); - if (!hashlimit_procdir4) { - printk(KERN_ERR "xt_hashlimit: unable to create proc dir " - "entry\n"); + err = init_xt_hashlimit(); + if (err) goto err3; - } - err = 0; -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) - hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", init_net.proc_net); - if (!hashlimit_procdir6) { - printk(KERN_ERR "xt_hashlimit: unable to create proc dir " - "entry\n"); - err = -ENOMEM; - } -#endif if (!err) return 0; - remove_proc_entry("ipt_hashlimit", init_net.proc_net); err3: kmem_cache_destroy(hashlimit_cachep); err2: @@ -1003,10 +1085,7 @@ err1: static void __exit hashlimit_mt_exit(void) { - remove_proc_entry("ipt_hashlimit", init_net.proc_net); -#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) - remove_proc_entry("ip6t_hashlimit", init_net.proc_net); -#endif + 
fini_xt_hashlimit(); kmem_cache_destroy(hashlimit_cachep); xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); } diff -urNp linux-2.6.32.48/net/netfilter/xt_iprange.c linux-2.6.32.48-openvz/net/netfilter/xt_iprange.c --- linux-2.6.32.48/net/netfilter/xt_iprange.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_iprange.c 2011-11-21 17:40:47.000000000 -0500 @@ -14,6 +14,40 @@ #include #include #include +#include + +static bool +iprange_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct ipt_iprange_info *info = par->matchinfo; + const struct iphdr *iph = ip_hdr(skb); + + if (info->flags & IPRANGE_SRC) { + if ((ntohl(iph->saddr) < ntohl(info->src.min_ip) + || ntohl(iph->saddr) > ntohl(info->src.max_ip)) + ^ !!(info->flags & IPRANGE_SRC_INV)) { + pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n", + &iph->saddr, + info->flags & IPRANGE_SRC_INV ? "(INV) " : "", + &info->src.min_ip, + &info->src.max_ip); + return false; + } + } + if (info->flags & IPRANGE_DST) { + if ((ntohl(iph->daddr) < ntohl(info->dst.min_ip) + || ntohl(iph->daddr) > ntohl(info->dst.max_ip)) + ^ !!(info->flags & IPRANGE_DST_INV)) { + pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n", + &iph->daddr, + info->flags & IPRANGE_DST_INV ? 
"(INV) " : "", + &info->dst.min_ip, + &info->dst.max_ip); + return false; + } + } + return true; +} static bool iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par) @@ -93,6 +127,14 @@ iprange_mt6(const struct sk_buff *skb, c static struct xt_match iprange_mt_reg[] __read_mostly = { { .name = "iprange", + .revision = 0, + .family = NFPROTO_IPV4, + .match = iprange_mt_v0, + .matchsize = sizeof(struct ipt_iprange_info), + .me = THIS_MODULE, + }, + { + .name = "iprange", .revision = 1, .family = NFPROTO_IPV4, .match = iprange_mt4, diff -urNp linux-2.6.32.48/net/netfilter/xt_limit.c linux-2.6.32.48-openvz/net/netfilter/xt_limit.c --- linux-2.6.32.48/net/netfilter/xt_limit.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_limit.c 2011-11-21 17:40:47.000000000 -0500 @@ -105,7 +105,7 @@ static bool limit_mt_check(const struct /* Check for overflow. */ if (r->burst == 0 || user2credits(r->avg * r->burst) < user2credits(r->avg)) { - printk("Overflow in xt_limit, try lower: %u/%u\n", + ve_printk(VE_LOG, "Overflow in xt_limit, try lower: %u/%u\n", r->avg, r->burst); return false; } diff -urNp linux-2.6.32.48/net/netfilter/xt_mark.c linux-2.6.32.48-openvz/net/netfilter/xt_mark.c --- linux-2.6.32.48/net/netfilter/xt_mark.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_mark.c 2011-11-21 17:40:47.000000000 -0500 @@ -23,6 +23,14 @@ MODULE_ALIAS("ipt_mark"); MODULE_ALIAS("ip6t_mark"); static bool +mark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct xt_mark_info *info = par->matchinfo; + + return ((skb->mark & info->mask) == info->mark) ^ info->invert; +} + +static bool mark_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct xt_mark_mtinfo1 *info = par->matchinfo; @@ -30,23 +38,81 @@ mark_mt(const struct sk_buff *skb, const return ((skb->mark & info->mask) == info->mark) ^ info->invert; } -static struct xt_match mark_mt_reg 
__read_mostly = { - .name = "mark", - .revision = 1, - .family = NFPROTO_UNSPEC, - .match = mark_mt, - .matchsize = sizeof(struct xt_mark_mtinfo1), - .me = THIS_MODULE, +static bool mark_mt_check_v0(const struct xt_mtchk_param *par) +{ + const struct xt_mark_info *minfo = par->matchinfo; + + if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { + printk(KERN_WARNING "mark: only supports 32bit mark\n"); + return false; + } + return true; +} + +#ifdef CONFIG_COMPAT +struct compat_xt_mark_info { + compat_ulong_t mark, mask; + u_int8_t invert; + u_int8_t __pad1; + u_int16_t __pad2; +}; + +static void mark_mt_compat_from_user_v0(void *dst, void *src) +{ + const struct compat_xt_mark_info *cm = src; + struct xt_mark_info m = { + .mark = cm->mark, + .mask = cm->mask, + .invert = cm->invert, + }; + memcpy(dst, &m, sizeof(m)); +} + +static int mark_mt_compat_to_user_v0(void __user *dst, void *src) +{ + const struct xt_mark_info *m = src; + struct compat_xt_mark_info cm = { + .mark = m->mark, + .mask = m->mask, + .invert = m->invert, + }; + return copy_to_user(dst, &cm, sizeof(cm)) ? 
-EFAULT : 0; +} +#endif /* CONFIG_COMPAT */ + +static struct xt_match mark_mt_reg[] __read_mostly = { + { + .name = "mark", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = mark_mt_check_v0, + .match = mark_mt_v0, + .matchsize = sizeof(struct xt_mark_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_mark_info), + .compat_from_user = mark_mt_compat_from_user_v0, + .compat_to_user = mark_mt_compat_to_user_v0, +#endif + .me = THIS_MODULE, + }, + { + .name = "mark", + .revision = 1, + .family = NFPROTO_UNSPEC, + .match = mark_mt, + .matchsize = sizeof(struct xt_mark_mtinfo1), + .me = THIS_MODULE, + }, }; static int __init mark_mt_init(void) { - return xt_register_match(&mark_mt_reg); + return xt_register_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg)); } static void __exit mark_mt_exit(void) { - xt_unregister_match(&mark_mt_reg); + xt_unregister_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg)); } module_init(mark_mt_init); diff -urNp linux-2.6.32.48/net/netfilter/xt_MARK.c linux-2.6.32.48-openvz/net/netfilter/xt_MARK.c --- linux-2.6.32.48/net/netfilter/xt_MARK.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_MARK.c 2011-11-21 17:40:47.000000000 -0500 @@ -25,6 +25,39 @@ MODULE_ALIAS("ipt_MARK"); MODULE_ALIAS("ip6t_MARK"); static unsigned int +mark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) +{ + const struct xt_mark_target_info *markinfo = par->targinfo; + + skb->mark = markinfo->mark; + return XT_CONTINUE; +} + +static unsigned int +mark_tg_v1(struct sk_buff *skb, const struct xt_target_param *par) +{ + const struct xt_mark_target_info_v1 *markinfo = par->targinfo; + int mark = 0; + + switch (markinfo->mode) { + case XT_MARK_SET: + mark = markinfo->mark; + break; + + case XT_MARK_AND: + mark = skb->mark & markinfo->mark; + break; + + case XT_MARK_OR: + mark = skb->mark | markinfo->mark; + break; + } + + skb->mark = mark; + return XT_CONTINUE; +} + +static unsigned int mark_tg(struct sk_buff 
*skb, const struct xt_target_param *par) { const struct xt_mark_tginfo2 *info = par->targinfo; @@ -33,23 +66,135 @@ mark_tg(struct sk_buff *skb, const struc return XT_CONTINUE; } -static struct xt_target mark_tg_reg __read_mostly = { - .name = "MARK", - .revision = 2, - .family = NFPROTO_UNSPEC, - .target = mark_tg, - .targetsize = sizeof(struct xt_mark_tginfo2), - .me = THIS_MODULE, +static bool mark_tg_check_v0(const struct xt_tgchk_param *par) +{ + const struct xt_mark_target_info *markinfo = par->targinfo; + + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return false; + } + return true; +} + +static bool mark_tg_check_v1(const struct xt_tgchk_param *par) +{ + const struct xt_mark_target_info_v1 *markinfo = par->targinfo; + + if (markinfo->mode != XT_MARK_SET + && markinfo->mode != XT_MARK_AND + && markinfo->mode != XT_MARK_OR) { + printk(KERN_WARNING "MARK: unknown mode %u\n", + markinfo->mode); + return false; + } + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return false; + } + return true; +} + +#ifdef CONFIG_COMPAT +struct compat_xt_mark_target_info { + compat_ulong_t mark; +}; + +static void mark_tg_compat_from_user_v0(void *dst, void *src) +{ + const struct compat_xt_mark_target_info *cm = src; + struct xt_mark_target_info m = { + .mark = cm->mark, + }; + memcpy(dst, &m, sizeof(m)); +} + +static int mark_tg_compat_to_user_v0(void __user *dst, void *src) +{ + const struct xt_mark_target_info *m = src; + struct compat_xt_mark_target_info cm = { + .mark = m->mark, + }; + return copy_to_user(dst, &cm, sizeof(cm)) ? 
-EFAULT : 0; +} + +struct compat_xt_mark_target_info_v1 { + compat_ulong_t mark; + u_int8_t mode; + u_int8_t __pad1; + u_int16_t __pad2; +}; + +static void mark_tg_compat_from_user_v1(void *dst, void *src) +{ + const struct compat_xt_mark_target_info_v1 *cm = src; + struct xt_mark_target_info_v1 m = { + .mark = cm->mark, + .mode = cm->mode, + }; + memcpy(dst, &m, sizeof(m)); +} + +static int mark_tg_compat_to_user_v1(void __user *dst, void *src) +{ + const struct xt_mark_target_info_v1 *m = src; + struct compat_xt_mark_target_info_v1 cm = { + .mark = m->mark, + .mode = m->mode, + }; + return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; +} +#endif /* CONFIG_COMPAT */ + +static struct xt_target mark_tg_reg[] __read_mostly = { + { + .name = "MARK", + .family = NFPROTO_UNSPEC, + .revision = 0, + .checkentry = mark_tg_check_v0, + .target = mark_tg_v0, + .targetsize = sizeof(struct xt_mark_target_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_mark_target_info), + .compat_from_user = mark_tg_compat_from_user_v0, + .compat_to_user = mark_tg_compat_to_user_v0, +#endif + .table = "mangle", + .me = THIS_MODULE, + }, + { + .name = "MARK", + .family = NFPROTO_UNSPEC, + .revision = 1, + .checkentry = mark_tg_check_v1, + .target = mark_tg_v1, + .targetsize = sizeof(struct xt_mark_target_info_v1), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_mark_target_info_v1), + .compat_from_user = mark_tg_compat_from_user_v1, + .compat_to_user = mark_tg_compat_to_user_v1, +#endif + .table = "mangle", + .me = THIS_MODULE, + }, + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_UNSPEC, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, }; static int __init mark_tg_init(void) { - return xt_register_target(&mark_tg_reg); + return xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); } static void __exit mark_tg_exit(void) { - xt_unregister_target(&mark_tg_reg); + xt_unregister_targets(mark_tg_reg, 
ARRAY_SIZE(mark_tg_reg)); } module_init(mark_tg_init); diff -urNp linux-2.6.32.48/net/netfilter/xt_owner.c linux-2.6.32.48-openvz/net/netfilter/xt_owner.c --- linux-2.6.32.48/net/netfilter/xt_owner.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_owner.c 2011-11-21 17:40:47.000000000 -0500 @@ -16,6 +16,60 @@ #include #include #include +#include +#include + +static bool +owner_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct ipt_owner_info *info = par->matchinfo; + const struct file *filp; + + if (skb->sk == NULL || skb->sk->sk_socket == NULL) + return false; + + filp = skb->sk->sk_socket->file; + if (filp == NULL) + return false; + + if (info->match & IPT_OWNER_UID) + if ((filp->f_cred->fsuid != info->uid) ^ + !!(info->invert & IPT_OWNER_UID)) + return false; + + if (info->match & IPT_OWNER_GID) + if ((filp->f_cred->fsgid != info->gid) ^ + !!(info->invert & IPT_OWNER_GID)) + return false; + + return true; +} + +static bool +owner_mt6_v0(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct ip6t_owner_info *info = par->matchinfo; + const struct file *filp; + + if (skb->sk == NULL || skb->sk->sk_socket == NULL) + return false; + + filp = skb->sk->sk_socket->file; + if (filp == NULL) + return false; + + if (info->match & IP6T_OWNER_UID) + if ((filp->f_cred->fsuid != info->uid) ^ + !!(info->invert & IP6T_OWNER_UID)) + return false; + + if (info->match & IP6T_OWNER_GID) + if ((filp->f_cred->fsgid != info->gid) ^ + !!(info->invert & IP6T_OWNER_GID)) + return false; + + return true; +} static bool owner_mt(const struct sk_buff *skb, const struct xt_match_param *par) @@ -52,25 +106,76 @@ owner_mt(const struct sk_buff *skb, cons return true; } -static struct xt_match owner_mt_reg __read_mostly = { - .name = "owner", - .revision = 1, - .family = NFPROTO_UNSPEC, - .match = owner_mt, - .matchsize = sizeof(struct xt_owner_match_info), - .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << 
NF_INET_POST_ROUTING), - .me = THIS_MODULE, +static bool owner_mt_check_v0(const struct xt_mtchk_param *par) +{ + const struct ipt_owner_info *info = par->matchinfo; + + if (info->match & (IPT_OWNER_PID | IPT_OWNER_SID | IPT_OWNER_COMM)) { + printk(KERN_WARNING KBUILD_MODNAME + ": PID, SID and command matching is not " + "supported anymore\n"); + return false; + } + + return true; +} + +static bool owner_mt6_check_v0(const struct xt_mtchk_param *par) +{ + const struct ip6t_owner_info *info = par->matchinfo; + + if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) { + printk(KERN_WARNING KBUILD_MODNAME + ": PID and SID matching is not supported anymore\n"); + return false; + } + + return true; +} + +static struct xt_match owner_mt_reg[] __read_mostly = { + { + .name = "owner", + .revision = 0, + .family = NFPROTO_IPV4, + .match = owner_mt_v0, + .matchsize = sizeof(struct ipt_owner_info), + .checkentry = owner_mt_check_v0, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, + { + .name = "owner", + .revision = 0, + .family = NFPROTO_IPV6, + .match = owner_mt6_v0, + .matchsize = sizeof(struct ip6t_owner_info), + .checkentry = owner_mt6_check_v0, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, + { + .name = "owner", + .revision = 1, + .family = NFPROTO_UNSPEC, + .match = owner_mt, + .matchsize = sizeof(struct xt_owner_match_info), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, }; static int __init owner_mt_init(void) { - return xt_register_match(&owner_mt_reg); + return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg)); } static void __exit owner_mt_exit(void) { - xt_unregister_match(&owner_mt_reg); + xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg)); } module_init(owner_mt_init); diff -urNp linux-2.6.32.48/net/netfilter/xt_recent.c linux-2.6.32.48-openvz/net/netfilter/xt_recent.c --- 
linux-2.6.32.48/net/netfilter/xt_recent.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_recent.c 2011-11-21 17:40:47.000000000 -0500 @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -58,6 +60,9 @@ MODULE_PARM_DESC(ip_list_perms, "permiss MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/xt_recent/* files"); MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/xt_recent/* files"); +static int init_ipt_recent(struct ve_struct *ve); +static void fini_ipt_recent(struct ve_struct *ve); + struct recent_entry { struct list_head list; struct list_head lru_list; @@ -78,15 +83,27 @@ struct recent_table { struct list_head iphash[0]; }; +#if defined(CONFIG_VE_IPTABLES) +#define tables (get_exec_env()->_ipt_recent->tables) +#else static LIST_HEAD(tables); +#endif static DEFINE_SPINLOCK(recent_lock); static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS #ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT +#if defined(CONFIG_VE_IPTABLES) +#define proc_old_dir (get_exec_env()->_ipt_recent->proc_old_dir) +#else static struct proc_dir_entry *proc_old_dir; #endif +#endif +#if defined(CONFIG_VE_IPTABLES) +#define recent_proc_dir (get_exec_env()->_ipt_recent->proc_dir) +#else static struct proc_dir_entry *recent_proc_dir; +#endif static const struct file_operations recent_old_fops, recent_mt_fops; #endif @@ -300,6 +317,9 @@ static bool recent_mt_check(const struct strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN) return false; + if (init_ipt_recent(get_exec_env())) + return 0; + mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); if (t != NULL) { @@ -351,6 +371,13 @@ static void recent_mt_destroy(const stru { const struct xt_recent_mtinfo *info = par->matchinfo; struct recent_table *t; + struct ve_struct *ve; + + ve = get_exec_env(); +#ifdef CONFIG_VE_IPTABLES + if (!ve->_ipt_recent) + return; +#endif mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); @@ -368,6 
+395,8 @@ static void recent_mt_destroy(const stru kfree(t); } mutex_unlock(&recent_mutex); + if (!ve_is_super(ve) && list_empty(&tables)) + fini_ipt_recent(ve); } #ifdef CONFIG_PROC_FS @@ -637,19 +666,33 @@ static struct xt_match recent_mt_reg[] _ }, }; +/* + * Set up xt_recent state for a VE: the table list (CONFIG_VE_IPTABLES) + * and the proc directories (CONFIG_PROC_FS) in the VE's network + * namespace. Returns 0 when the state already exists or was created, + * -ENOMEM otherwise. The matches are registered once by + * recent_mt_init(), so error paths here undo only this function's work. + */ -static int __init recent_mt_init(void) +static int init_ipt_recent(struct ve_struct *ve) { - int err; + int err = 0; - if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) - return -EINVAL; - ip_list_hash_size = 1 << fls(ip_list_tot); +#ifdef CONFIG_VE_IPTABLES + if (ve->_ipt_recent) + return 0; - err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + ve->_ipt_recent = kzalloc(sizeof(struct ve_ipt_recent), GFP_KERNEL); + if (!ve->_ipt_recent) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&tables); +#endif #ifdef CONFIG_PROC_FS if (err) return err; - recent_proc_dir = proc_mkdir("xt_recent", init_net.proc_net); + recent_proc_dir = proc_mkdir("xt_recent", ve->ve_netns->proc_net); if (recent_proc_dir == NULL) { - xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); err = -ENOMEM; + goto out_mem; @@ -657,8 +700,8 @@ static int __init recent_mt_init(void) #ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT if (err < 0) return err; - proc_old_dir = proc_mkdir("ipt_recent", init_net.proc_net); + proc_old_dir = proc_mkdir("ipt_recent", ve->ve_netns->proc_net); if (proc_old_dir == NULL) { - remove_proc_entry("xt_recent", init_net.proc_net); - xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + remove_proc_entry("xt_recent", ve->ve_netns->proc_net); err = -ENOMEM; + goto out_mem; @@ -665,20 +708,54 @@ static int __init recent_mt_init(void) } #endif #endif +out: return err; +out_mem: + /* Drop the half-built state so that a later call can retry. */ +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_ipt_recent); + ve->_ipt_recent = NULL; +#endif + goto out; } -static void __exit recent_mt_exit(void) +static void fini_ipt_recent(struct ve_struct *ve) { - BUG_ON(!list_empty(&tables)); - xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); #ifdef CONFIG_PROC_FS #ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT - remove_proc_entry("ipt_recent", init_net.proc_net); + 
remove_proc_entry("ipt_recent", ve->ve_netns->proc_net); +#endif + remove_proc_entry("xt_recent", ve->ve_netns->proc_net); #endif - remove_proc_entry("xt_recent", init_net.proc_net); +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_ipt_recent); + ve->_ipt_recent = NULL; #endif } +static int __init recent_mt_init(void) +{ + int err; + + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); + + err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + if (err) + return err; + err = init_ipt_recent(&ve0); + if (err) + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + return err; +} + +static void __exit recent_mt_exit(void) +{ + BUG_ON(!list_empty(&tables)); + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + fini_ipt_recent(&ve0); +} + module_init(recent_mt_init); module_exit(recent_mt_exit); diff -urNp linux-2.6.32.48/net/netfilter/xt_TCPMSS.c linux-2.6.32.48-openvz/net/netfilter/xt_TCPMSS.c --- linux-2.6.32.48/net/netfilter/xt_TCPMSS.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netfilter/xt_TCPMSS.c 2011-11-21 17:40:47.000000000 -0500 @@ -67,7 +67,7 @@ tcpmss_mangle_packet(struct sk_buff *skb badly. 
--RR */ if (tcplen != tcph->doff*4) { if (net_ratelimit()) - printk(KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n", + ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n", skb->len); return -1; } @@ -75,14 +75,14 @@ tcpmss_mangle_packet(struct sk_buff *skb if (info->mss == XT_TCPMSS_CLAMP_PMTU) { if (dst_mtu(skb_dst(skb)) <= minlen) { if (net_ratelimit()) - printk(KERN_ERR "xt_TCPMSS: " + ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: " "unknown or invalid path-MTU (%u)\n", dst_mtu(skb_dst(skb))); return -1; } if (in_mtu <= minlen) { if (net_ratelimit()) - printk(KERN_ERR "xt_TCPMSS: unknown or " + ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: unknown or " "invalid path-MTU (%u)\n", in_mtu); return -1; } @@ -246,13 +246,13 @@ static bool tcpmss_tg4_check(const struc (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { - printk("xt_TCPMSS: path-MTU clamping only supported in " + ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in " "FORWARD, OUTPUT and POSTROUTING hooks\n"); return false; } if (IPT_MATCH_ITERATE(e, find_syn_match)) return true; - printk("xt_TCPMSS: Only works on TCP SYN packets\n"); + ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n"); return false; } @@ -266,13 +266,13 @@ static bool tcpmss_tg6_check(const struc (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { - printk("xt_TCPMSS: path-MTU clamping only supported in " + ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in " "FORWARD, OUTPUT and POSTROUTING hooks\n"); return false; } if (IP6T_MATCH_ITERATE(e, find_syn_match)) return true; - printk("xt_TCPMSS: Only works on TCP SYN packets\n"); + ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n"); return false; } #endif diff -urNp linux-2.6.32.48/net/netlink/af_netlink.c linux-2.6.32.48-openvz/net/netlink/af_netlink.c --- linux-2.6.32.48/net/netlink/af_netlink.c 2011-11-08 
19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netlink/af_netlink.c 2011-11-21 17:40:47.000000000 -0500 @@ -60,29 +60,14 @@ #include #include #include +#include + +#include +#include #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) -struct netlink_sock { - /* struct sock has to be the first member of netlink_sock */ - struct sock sk; - u32 pid; - u32 dst_pid; - u32 dst_group; - u32 flags; - u32 subscriptions; - u32 ngroups; - unsigned long *groups; - unsigned long state; - wait_queue_head_t wait; - struct netlink_callback *cb; - struct mutex *cb_mutex; - struct mutex cb_def_mutex; - void (*netlink_rcv)(struct sk_buff *skb); - struct module *module; -}; - struct listeners_rcu_head { struct rcu_head rcu_head; void *ptr; @@ -411,6 +396,8 @@ static int __netlink_create(struct net * sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); if (!sk) return -ENOMEM; + if (ub_other_sock_charge(sk)) + goto out_free; sock_init_data(sock, sk); @@ -426,6 +413,10 @@ static int __netlink_create(struct net * sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; + +out_free: + sk_free(sk); + return -ENOMEM; } static int netlink_create(struct net *net, struct socket *sock, int protocol) @@ -539,7 +530,7 @@ static int netlink_autobind(struct socke struct hlist_head *head; struct sock *osk; struct hlist_node *node; - s32 pid = current->tgid; + s32 pid = task_tgid_vnr(current); int err; static s32 rover = -4097; @@ -575,7 +566,7 @@ retry: static inline int netlink_capable(struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || - capable(CAP_NET_ADMIN); + capable(CAP_VE_NET_ADMIN); } static void @@ -785,12 +776,20 @@ int netlink_attachskb(struct sock *sk, s long *timeo, struct sock *ssk) { struct netlink_sock *nlk; + unsigned long chargesize; + int no_ubc; nlk = nlk_sk(sk); - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + 
chargesize = skb_charge_fullsize(skb); + no_ubc = ub_sock_getwres_other(sk, chargesize); + if (no_ubc || atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) { DECLARE_WAITQUEUE(wait, current); + + if (!no_ubc) + ub_sock_retwres_other(sk, chargesize, + SOCK_MIN_UBCSPACE_CH); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); @@ -802,13 +801,20 @@ int netlink_attachskb(struct sock *sk, s __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); + /* this if can't be moved upper because ub_sock_snd_queue_add() + * may change task state to TASK_RUNNING */ + if (no_ubc) + ub_sock_sndqueueadd_other(sk, chargesize); + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) && + test_bit(0, &nlk->state) || no_ubc) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); + if (no_ubc) + ub_sock_sndqueuedel(sk); sock_put(sk); if (signal_pending(current)) { @@ -818,6 +824,7 @@ int netlink_attachskb(struct sock *sk, s return 1; } skb_set_owner_r(skb, sk); + ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); return 0; } @@ -984,8 +991,13 @@ static inline int do_one_broadcast(struc !test_bit(p->group - 1, nlk->groups)) goto out; + if (!ve_accessible_strict(get_exec_env(), sk->owner_env)) + goto out; + +#ifndef CONFIG_VE if (!net_eq(sock_net(sk), p->net)) goto out; +#endif if (p->failure) { netlink_overrun(sk); @@ -1649,6 +1661,10 @@ static int netlink_dump(struct sock *sk) skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); if (!skb) goto errout; + if (ub_nlrcvbuf_charge(skb, sk) < 0) { + kfree_skb(skb); + return -EACCES; + } mutex_lock(nlk->cb_mutex); diff -urNp linux-2.6.32.48/net/netlink/genetlink.c linux-2.6.32.48-openvz/net/netlink/genetlink.c --- linux-2.6.32.48/net/netlink/genetlink.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/netlink/genetlink.c 2011-11-21 
17:40:47.000000000 -0500 @@ -519,7 +519,7 @@ static int genl_rcv_msg(struct sk_buff * return -EOPNOTSUPP; if ((ops->flags & GENL_ADMIN_PERM) && - security_netlink_recv(skb, CAP_NET_ADMIN)) + security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; if (nlh->nlmsg_flags & NLM_F_DUMP) { diff -urNp linux-2.6.32.48/net/packet/af_packet.c linux-2.6.32.48-openvz/net/packet/af_packet.c --- linux-2.6.32.48/net/packet/af_packet.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/packet/af_packet.c 2011-11-21 17:40:47.000000000 -0500 @@ -80,6 +80,8 @@ #include #include +#include + #ifdef CONFIG_INET #include #endif @@ -554,6 +556,8 @@ static int packet_rcv(struct sk_buff *sk if (dev_net(dev) != sock_net(sk)) goto drop; + skb_orphan(skb); + skb->dev = dev; if (dev->header_ops) { @@ -617,6 +621,9 @@ static int packet_rcv(struct sk_buff *sk if (pskb_trim(skb, snaplen)) goto drop_n_acct; + if (ub_sockrcvbuf_charge(sk, skb)) + goto drop_n_acct; + skb_set_owner_r(skb, sk); skb->dev = NULL; skb_dst_drop(skb); @@ -676,6 +683,8 @@ static int tpacket_rcv(struct sk_buff *s if (dev_net(dev) != sock_net(sk)) goto drop; + skb_orphan(skb); + if (dev->header_ops) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); @@ -725,6 +734,12 @@ static int tpacket_rcv(struct sk_buff *s snaplen = 0; } + if (copy_skb && + ub_sockrcvbuf_charge(sk, copy_skb)) { + spin_lock(&sk->sk_receive_queue.lock); + goto ring_is_full; + } + spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL); if (!h.raw) @@ -1370,6 +1385,8 @@ static int packet_create(struct net *net sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); if (sk == NULL) goto out; + if (ub_other_sock_charge(sk)) + goto out_free; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) @@ -1409,6 +1426,9 @@ static int packet_create(struct net *net sock_prot_inuse_add(net, &packet_proto, 1); write_unlock_bh(&net->packet.sklist_lock); return 0; + 
+out_free: + sk_free(sk); out: return err; } diff -urNp linux-2.6.32.48/net/sched/act_api.c linux-2.6.32.48-openvz/net/sched/act_api.c --- linux-2.6.32.48/net/sched/act_api.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/act_api.c 2011-11-21 17:40:47.000000000 -0500 @@ -666,7 +666,8 @@ nlmsg_failure: } static int -act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event) +act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n, + struct tc_action *a, int event) { struct sk_buff *skb; @@ -678,7 +679,7 @@ act_get_notify(u32 pid, struct nlmsghdr return -EINVAL; } - return rtnl_unicast(skb, &init_net, pid); + return rtnl_unicast(skb, net, pid); } static struct tc_action * @@ -748,7 +749,8 @@ static struct tc_action *create_a(int i) return act; } -static int tca_action_flush(struct nlattr *nla, struct nlmsghdr *n, u32 pid) +static int tca_action_flush(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 pid) { struct sk_buff *skb; unsigned char *b; @@ -807,7 +809,7 @@ static int tca_action_flush(struct nlatt nlh->nlmsg_flags |= NLM_F_ROOT; module_put(a->ops->owner); kfree(a); - err = rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); if (err > 0) return 0; @@ -824,7 +826,8 @@ noflush_out: } static int -tca_action_gd(struct nlattr *nla, struct nlmsghdr *n, u32 pid, int event) +tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, + u32 pid, int event) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO+1]; @@ -836,7 +839,7 @@ tca_action_gd(struct nlattr *nla, struct if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { if (tb[1] != NULL) - return tca_action_flush(tb[1], n, pid); + return tca_action_flush(net, tb[1], n, pid); else return -EINVAL; } @@ -857,7 +860,7 @@ tca_action_gd(struct nlattr *nla, struct } if (event == RTM_GETACTION) - ret = act_get_notify(pid, n, head, event); + ret = 
act_get_notify(net, pid, n, head, event); else { /* delete */ struct sk_buff *skb; @@ -876,7 +879,7 @@ tca_action_gd(struct nlattr *nla, struct /* now do the delete */ tcf_action_destroy(head, 0); - ret = rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, + ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); if (ret > 0) return 0; @@ -887,8 +890,8 @@ err: return ret; } -static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, - u16 flags) +static int tcf_add_notify(struct net *net, struct tc_action *a, + u32 pid, u32 seq, int event, u16 flags) { struct tcamsg *t; struct nlmsghdr *nlh; @@ -921,7 +924,7 @@ static int tcf_add_notify(struct tc_acti nlh->nlmsg_len = skb_tail_pointer(skb) - b; NETLINK_CB(skb).dst_group = RTNLGRP_TC; - err = rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, flags&NLM_F_ECHO); + err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags&NLM_F_ECHO); if (err > 0) err = 0; return err; @@ -934,7 +937,8 @@ nlmsg_failure: static int -tcf_action_add(struct nlattr *nla, struct nlmsghdr *n, u32 pid, int ovr) +tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, + u32 pid, int ovr) { int ret = 0; struct tc_action *act; @@ -952,7 +956,7 @@ tcf_action_add(struct nlattr *nla, struc /* dump then free all the actions after update; inserted policy * stays intact * */ - ret = tcf_add_notify(act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); + ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); for (a = act; a; a = act) { act = a->next; kfree(a); @@ -968,9 +972,6 @@ static int tc_ctl_action(struct sk_buff u32 pid = skb ? 
NETLINK_CB(skb).pid : 0; int ret = 0, ovr = 0; - if (net != &init_net) - return -EINVAL; - ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); if (ret < 0) return ret; @@ -993,15 +994,17 @@ static int tc_ctl_action(struct sk_buff if (n->nlmsg_flags&NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(tca[TCA_ACT_TAB], n, pid, ovr); + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr); if (ret == -EAGAIN) goto replay; break; case RTM_DELACTION: - ret = tca_action_gd(tca[TCA_ACT_TAB], n, pid, RTM_DELACTION); + ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, + pid, RTM_DELACTION); break; case RTM_GETACTION: - ret = tca_action_gd(tca[TCA_ACT_TAB], n, pid, RTM_GETACTION); + ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, + pid, RTM_GETACTION); break; default: BUG(); @@ -1041,7 +1044,6 @@ find_dump_kind(const struct nlmsghdr *n) static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -1051,9 +1053,6 @@ tc_dump_action(struct sk_buff *skb, stru struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); struct nlattr *kind = find_dump_kind(cb->nlh); - if (net != &init_net) - return 0; - if (kind == NULL) { printk("tc_dump_action: action bad kind\n"); return 0; diff -urNp linux-2.6.32.48/net/sched/cls_api.c linux-2.6.32.48-openvz/net/sched/cls_api.c --- linux-2.6.32.48/net/sched/cls_api.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/cls_api.c 2011-11-21 17:40:47.000000000 -0500 @@ -98,8 +98,9 @@ out: } EXPORT_SYMBOL(unregister_tcf_proto_ops); -static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct tcf_proto *tp, unsigned long fh, int event); +static int tfilter_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct tcf_proto *tp, + unsigned long fh, int event); /* Select new prio value from the range, managed by kernel. 
*/ @@ -137,9 +138,6 @@ static int tc_ctl_tfilter(struct sk_buff int err; int tp_created = 0; - if (net != &init_net) - return -EINVAL; - replay: t = NLMSG_DATA(n); protocol = TC_H_MIN(t->tcm_info); @@ -158,7 +156,7 @@ replay: /* Find head of filter chain. */ /* Find link */ - dev = __dev_get_by_index(&init_net, t->tcm_ifindex); + dev = __dev_get_by_index(net, t->tcm_ifindex); if (dev == NULL) return -ENODEV; @@ -282,7 +280,7 @@ replay: *back = tp->next; spin_unlock_bh(root_lock); - tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); tcf_destroy(tp); err = 0; goto errout; @@ -305,10 +303,10 @@ replay: case RTM_DELTFILTER: err = tp->ops->delete(tp, fh); if (err == 0) - tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); goto errout; case RTM_GETTFILTER: - err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); goto errout; default: err = -EINVAL; @@ -324,7 +322,7 @@ replay: *back = tp; spin_unlock_bh(root_lock); } - tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); } else { if (tp_created) tcf_destroy(tp); @@ -370,8 +368,9 @@ nla_put_failure: return -1; } -static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct tcf_proto *tp, unsigned long fh, int event) +static int tfilter_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct tcf_proto *tp, + unsigned long fh, int event) { struct sk_buff *skb; u32 pid = oskb ? 
NETLINK_CB(oskb).pid : 0; @@ -385,7 +384,7 @@ static int tfilter_notify(struct sk_buff return -EINVAL; } - return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, + return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } @@ -417,12 +416,9 @@ static int tc_dump_tfilter(struct sk_buf const struct Qdisc_class_ops *cops; struct tcf_dump_args arg; - if (net != &init_net) - return 0; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) return skb->len; - if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) + if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) return skb->len; if (!tcm->tcm_parent) diff -urNp linux-2.6.32.48/net/sched/cls_flow.c linux-2.6.32.48-openvz/net/sched/cls_flow.c --- linux-2.6.32.48/net/sched/cls_flow.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/cls_flow.c 2011-11-21 17:40:47.000000000 -0500 @@ -601,7 +601,6 @@ static unsigned long flow_get(struct tcf static void flow_put(struct tcf_proto *tp, unsigned long f) { - return; } static int flow_dump(struct tcf_proto *tp, unsigned long fh, diff -urNp linux-2.6.32.48/net/sched/em_meta.c linux-2.6.32.48-openvz/net/sched/em_meta.c --- linux-2.6.32.48/net/sched/em_meta.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/em_meta.c 2011-11-21 17:40:47.000000000 -0500 @@ -309,7 +309,7 @@ META_COLLECTOR(var_sk_bound_if) } else { struct net_device *dev; - dev = dev_get_by_index(&init_net, skb->sk->sk_bound_dev_if); + dev = dev_get_by_index(sock_net(skb->sk), skb->sk->sk_bound_dev_if); *err = var_dev(dev, dst); if (dev) dev_put(dev); diff -urNp linux-2.6.32.48/net/sched/sch_api.c linux-2.6.32.48-openvz/net/sched/sch_api.c --- linux-2.6.32.48/net/sched/sch_api.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/sch_api.c 2011-11-21 17:40:47.000000000 -0500 @@ -34,10 +34,12 @@ #include #include -static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, +static int 
qdisc_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new); -static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct Qdisc *q, unsigned long cl, int event); +static int tclass_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, int event); /* @@ -638,11 +640,12 @@ void qdisc_tree_decrease_qlen(struct Qdi } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); -static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid, +static void notify_and_destroy(struct net *net, struct sk_buff *skb, + struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new) { if (new || old) - qdisc_notify(skb, n, clid, old, new); + qdisc_notify(net, skb, n, clid, old, new); if (old) qdisc_destroy(old); @@ -662,6 +665,7 @@ static int qdisc_graft(struct net_device struct Qdisc *new, struct Qdisc *old) { struct Qdisc *q = old; + struct net *net = dev_net(dev); int err = 0; if (parent == NULL) { @@ -698,12 +702,13 @@ static int qdisc_graft(struct net_device } if (!ingress) { - notify_and_destroy(skb, n, classid, dev->qdisc, new); + notify_and_destroy(net, skb, n, classid, + dev->qdisc, new); if (new && !new->ops->attach) atomic_inc(&new->refcnt); dev->qdisc = new ? 
: &noop_qdisc; } else { - notify_and_destroy(skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new); } if (dev->flags & IFF_UP) @@ -721,7 +726,7 @@ static int qdisc_graft(struct net_device err = -ENOENT; } if (!err) - notify_and_destroy(skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new); } return err; } @@ -947,10 +952,7 @@ static int tc_get_qdisc(struct sk_buff * struct Qdisc *p = NULL; int err; - if (net != &init_net) - return -EINVAL; - - if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) + if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) return -ENODEV; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -990,7 +992,7 @@ static int tc_get_qdisc(struct sk_buff * if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0) return err; } else { - qdisc_notify(skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q); } return 0; } @@ -1009,16 +1011,13 @@ static int tc_modify_qdisc(struct sk_buf struct Qdisc *q, *p; int err; - if (net != &init_net) - return -EINVAL; - replay: /* Reinit, just in case something touches this. */ tcm = NLMSG_DATA(n); clid = tcm->tcm_parent; q = p = NULL; - if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) + if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) return -ENODEV; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -1105,7 +1104,7 @@ replay: return -EINVAL; err = qdisc_change(q, tca); if (err == 0) - qdisc_notify(skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q); return err; create_n_graft: @@ -1200,8 +1199,9 @@ static bool tc_qdisc_dump_ignore(struct return (q->flags & TCQ_F_BUILTIN) ? 
true : false; } -static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, - u32 clid, struct Qdisc *old, struct Qdisc *new) +static int qdisc_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new) { struct sk_buff *skb; u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; @@ -1220,7 +1220,7 @@ static int qdisc_notify(struct sk_buff * } if (skb->len) - return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); err_out: kfree_skb(skb); @@ -1274,14 +1274,11 @@ static int tc_dump_qdisc(struct sk_buff int s_idx, s_q_idx; struct net_device *dev; - if (net != &init_net) - return 0; - s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; read_lock(&dev_base_lock); idx = 0; - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { struct netdev_queue *dev_queue; if (idx < s_idx) @@ -1333,10 +1330,7 @@ static int tc_ctl_tclass(struct sk_buff u32 qid = TC_H_MAJ(clid); int err; - if (net != &init_net) - return -EINVAL; - - if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) + if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) return -ENODEV; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -1417,10 +1411,10 @@ static int tc_ctl_tclass(struct sk_buff if (cops->delete) err = cops->delete(q, cl); if (err == 0) - tclass_notify(skb, n, q, cl, RTM_DELTCLASS); + tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS); goto out; case RTM_GETTCLASS: - err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); + err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); goto out; default: err = -EINVAL; @@ -1433,7 +1427,7 @@ static int tc_ctl_tclass(struct sk_buff if (cops->change) err = cops->change(q, clid, pid, tca, &new_cl); if (err == 0) - tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); + tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); out: if (cl) @@ -1485,8 +1479,9 @@ nla_put_failure: 
return -1; } -static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct Qdisc *q, unsigned long cl, int event) +static int tclass_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, int event) { struct sk_buff *skb; u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; @@ -1500,7 +1495,7 @@ static int tclass_notify(struct sk_buff return -EINVAL; } - return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); } struct qdisc_dump_args @@ -1575,12 +1570,9 @@ static int tc_dump_tclass(struct sk_buff struct net_device *dev; int t, s_t; - if (net != &init_net) - return 0; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) return 0; - if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) + if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) return 0; s_t = cb->args[0]; @@ -1690,7 +1682,7 @@ static int psched_show(struct seq_file * static int psched_open(struct inode *inode, struct file *file) { - return single_open(file, psched_show, PDE(inode)->data); + return single_open(file, psched_show, NULL); } static const struct file_operations psched_fops = { @@ -1700,14 +1692,52 @@ static const struct file_operations psch .llseek = seq_lseek, .release = single_release, }; + +static int __net_init psched_net_init(struct net *net) +{ + struct proc_dir_entry *e; + + e = proc_net_fops_create(net, "psched", 0, &psched_fops); + if (e == NULL) + return -ENOMEM; + + return 0; +} + +static void __net_exit psched_net_exit(struct net *net) +{ + proc_net_remove(net, "psched"); +} +#else +static int __net_init psched_net_init(struct net *net) +{ + return 0; +} + +static void __net_exit psched_net_exit(struct net *net) +{ +} #endif +static struct pernet_operations psched_net_ops = { + .init = psched_net_init, + .exit = psched_net_exit, +}; + static int __init pktsched_init(void) { + int err; + + err = 
register_pernet_subsys(&psched_net_ops); + if (err) { + printk(KERN_ERR "pktsched_init: " + "cannot initialize per netns operations\n"); + return err; + } + register_qdisc(&pfifo_qdisc_ops); register_qdisc(&bfifo_qdisc_ops); register_qdisc(&mq_qdisc_ops); - proc_net_fops_create(&init_net, "psched", 0, &psched_fops); rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL); diff -urNp linux-2.6.32.48/net/sched/sch_cbq.c linux-2.6.32.48-openvz/net/sched/sch_cbq.c --- linux-2.6.32.48/net/sched/sch_cbq.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/sch_cbq.c 2011-11-21 17:40:47.000000000 -0500 @@ -873,8 +873,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int if (cl->deficit <= 0) { q->active[prio] = cl; - cl = cl->next_alive; cl->deficit += cl->quantum; + cl = cl->next_alive; } return skb; @@ -1047,17 +1047,19 @@ static void cbq_normalize_quanta(struct for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + long mtu; /* BUGGGG... Beware! This expression suffer of arithmetic overflows! 
*/ if (cl->priority == prio) { - cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ - q->quanta[prio]; - } - if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { - printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum); - cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; + cl->quantum = (cl->weight * cl->allot) / + (q->quanta[prio] / q->nclasses[prio]); } + mtu = qdisc_dev(cl->qdisc)->mtu; + if (cl->quantum <= mtu/2) + cl->quantum = mtu/2 + 1; + else if (cl->quantum > 32*mtu) + cl->quantum = 32*mtu; } } } diff -urNp linux-2.6.32.48/net/sched/sch_generic.c linux-2.6.32.48-openvz/net/sched/sch_generic.c --- linux-2.6.32.48/net/sched/sch_generic.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/sch_generic.c 2011-11-21 17:40:47.000000000 -0500 @@ -179,17 +179,23 @@ static inline int qdisc_restart(struct Q struct net_device *dev; spinlock_t *root_lock; struct sk_buff *skb; + int ret; + struct ve_struct *old_ve; /* Dequeue packet */ skb = dequeue_skb(q); if (unlikely(!skb)) return 0; + old_ve = set_exec_env(skb->owner_env); root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - return sch_direct_xmit(skb, q, dev, txq, root_lock); + ret = sch_direct_xmit(skb, q, dev, txq, root_lock); + (void)set_exec_env(old_ve); + + return ret; } void __qdisc_run(struct Qdisc *q) diff -urNp linux-2.6.32.48/net/sched/sch_ingress.c linux-2.6.32.48-openvz/net/sched/sch_ingress.c --- linux-2.6.32.48/net/sched/sch_ingress.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/sch_ingress.c 2011-11-21 17:40:47.000000000 -0500 @@ -44,7 +44,6 @@ static void ingress_put(struct Qdisc *sc static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { - return; } static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) diff -urNp linux-2.6.32.48/net/sched/sch_mq.c 
linux-2.6.32.48-openvz/net/sched/sch_mq.c --- linux-2.6.32.48/net/sched/sch_mq.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/sch_mq.c 2011-11-21 17:40:47.000000000 -0500 @@ -173,7 +173,6 @@ static unsigned long mq_get(struct Qdisc static void mq_put(struct Qdisc *sch, unsigned long cl) { - return; } static int mq_dump_class(struct Qdisc *sch, unsigned long cl, diff -urNp linux-2.6.32.48/net/sched/sch_multiq.c linux-2.6.32.48-openvz/net/sched/sch_multiq.c --- linux-2.6.32.48/net/sched/sch_multiq.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/sch_multiq.c 2011-11-21 17:40:47.000000000 -0500 @@ -339,7 +339,6 @@ static unsigned long multiq_bind(struct static void multiq_put(struct Qdisc *q, unsigned long cl) { - return; } static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, diff -urNp linux-2.6.32.48/net/sched/sch_prio.c linux-2.6.32.48-openvz/net/sched/sch_prio.c --- linux-2.6.32.48/net/sched/sch_prio.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/sch_prio.c 2011-11-21 17:40:47.000000000 -0500 @@ -302,7 +302,6 @@ static unsigned long prio_bind(struct Qd static void prio_put(struct Qdisc *q, unsigned long cl) { - return; } static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, diff -urNp linux-2.6.32.48/net/sched/sch_red.c linux-2.6.32.48-openvz/net/sched/sch_red.c --- linux-2.6.32.48/net/sched/sch_red.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/sch_red.c 2011-11-21 17:40:47.000000000 -0500 @@ -303,7 +303,6 @@ static unsigned long red_get(struct Qdis static void red_put(struct Qdisc *sch, unsigned long arg) { - return; } static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker) diff -urNp linux-2.6.32.48/net/sched/sch_teql.c linux-2.6.32.48-openvz/net/sched/sch_teql.c --- linux-2.6.32.48/net/sched/sch_teql.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sched/sch_teql.c 2011-11-21 
17:40:47.000000000 -0500 @@ -178,6 +178,9 @@ static int teql_qdisc_init(struct Qdisc struct teql_master *m = (struct teql_master*)sch->ops; struct teql_sched_data *q = qdisc_priv(sch); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (dev->hard_header_len > m->dev->hard_header_len) return -EINVAL; diff -urNp linux-2.6.32.48/net/sctp/ulpevent.c linux-2.6.32.48-openvz/net/sctp/ulpevent.c --- linux-2.6.32.48/net/sctp/ulpevent.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sctp/ulpevent.c 2011-11-21 17:40:47.000000000 -0500 @@ -701,7 +701,7 @@ struct sctp_ulpevent *sctp_ulpevent_make if (rx_count >= asoc->base.sk->sk_rcvbuf) { if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || - (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) + (!sk_rmem_schedule(asoc->base.sk, chunk->skb))) goto fail; } diff -urNp linux-2.6.32.48/net/socket.c linux-2.6.32.48-openvz/net/socket.c --- linux-2.6.32.48/net/socket.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/socket.c 2011-11-21 17:40:47.000000000 -0500 @@ -85,6 +85,7 @@ #include #include #include +#include #include #include @@ -162,15 +163,6 @@ static DEFINE_PER_CPU(int, sockets_in_us * divide and look after the messy bits. */ -#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - - 16 for IP, 16 for IPX, - 24 for IPv6, - about 80 for AX.25 - must be at least one bigger than - the AF_UNIX size (see net/unix/af_unix.c - :unix_mkname()). - */ - /** * move_addr_to_kernel - copy a socket address into kernel space * @uaddr: Address in user space @@ -192,6 +184,7 @@ int move_addr_to_kernel(void __user *uad return -EFAULT; return audit_sockaddr(ulen, kaddr); } +EXPORT_SYMBOL(move_addr_to_kernel); /** * move_addr_to_user - copy an address to user space @@ -497,6 +490,8 @@ static struct socket *sock_alloc(void) return sock; } +EXPORT_SYMBOL(sock_alloc); + /* * In theory you can't get an open on this inode, but /proc provides * a back door. 
Remember to keep it shut otherwise you'll let the @@ -524,6 +519,9 @@ const struct file_operations bad_sock_fo void sock_release(struct socket *sock) { + if (sock->sk) + ub_sock_sndqueuedel(sock->sk); + if (sock->ops) { struct module *owner = sock->ops->owner; @@ -1140,6 +1138,54 @@ call_kill: return 0; } +int vz_security_family_check(int family) +{ +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + return 0; + + switch (family) { + case PF_UNSPEC: + case PF_PACKET: + case PF_NETLINK: + case PF_UNIX: + case PF_INET: + case PF_INET6: + case PF_PPPOX: + case PF_KEY: + break; + default: + return -EAFNOSUPPORT; + } +#endif + return 0; +} +EXPORT_SYMBOL_GPL(vz_security_family_check); + +int vz_security_protocol_check(int protocol) +{ +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + return 0; + + switch (protocol) { + case IPPROTO_IP: + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_RAW: + case IPPROTO_DCCP: + case IPPROTO_GRE: + case IPPROTO_ESP: + case IPPROTO_AH: + break; + default: + return -EPROTONOSUPPORT; /* a protocol was refused, not an address family */ + } +#endif + return 0; +} +EXPORT_SYMBOL_GPL(vz_security_protocol_check); + static int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { @@ -1170,6 +1216,11 @@ static int __sock_create(struct net *net family = PF_PACKET; } + /* VZ compatibility layer */ + err = vz_security_family_check(family); + if (err < 0) + return err; + err = security_socket_create(family, type, protocol, kern); if (err) return err; @@ -2423,9 +2474,12 @@ int kernel_sock_ioctl(struct socket *soc { mm_segment_t oldfs = get_fs(); int err; + struct ve_struct *old_env; set_fs(KERNEL_DS); + old_env = set_exec_env(sock->sk->owner_env); err = sock->ops->ioctl(sock, cmd, arg); + (void)set_exec_env(old_env); set_fs(oldfs); return err; diff -urNp linux-2.6.32.48/net/sunrpc/clnt.c linux-2.6.32.48-openvz/net/sunrpc/clnt.c --- linux-2.6.32.48/net/sunrpc/clnt.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sunrpc/clnt.c
2011-11-21 17:40:47.000000000 -0500 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -95,6 +96,38 @@ static void rpc_unregister_client(struct spin_unlock(&rpc_client_lock); } +/* + * Grand abort timeout, in seconds (the client is marked broken once it expires) + */ +int xprt_abort_timeout = RPC_MAX_ABORT_TIMEOUT; + +static int rpc_abort_hard(struct rpc_task *task) +{ + struct rpc_clnt *clnt; + clnt = task->tk_client; + + if (clnt->cl_pr_time == 0) { + clnt->cl_pr_time = jiffies; + return 0; + } + if (xprt_abort_timeout == RPC_MAX_ABORT_TIMEOUT) + return 0; + if (time_before(jiffies, clnt->cl_pr_time + xprt_abort_timeout * HZ)) + return 0; + + printk(KERN_ERR "CT#%u: RPC client %p (server %s) is marked 'broken'. " + "Unmount/mount to get it working again.\n", + get_exec_env()->veid, clnt, clnt->cl_server); + clnt->cl_broken = 1; + rpc_killall_tasks(clnt); + return -ETIMEDOUT; +} + +static void rpc_abort_clear(struct rpc_task *task) +{ + task->tk_client->cl_pr_time = 0; +} + static int rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) { @@ -200,6 +233,7 @@ static struct rpc_clnt * rpc_new_client( clnt->cl_vers = version->number; clnt->cl_stats = program->stats; clnt->cl_metrics = rpc_alloc_iostats(clnt); + clnt->cl_broken = 0; err = -ENOMEM; if (clnt->cl_metrics == NULL) goto out_no_stats; @@ -336,8 +370,10 @@ struct rpc_clnt *rpc_create(struct rpc_c xprt->resvport = 0; clnt = rpc_new_client(args, xprt); - if (IS_ERR(clnt)) + if (IS_ERR(clnt)) { + put_ve(xprt->owner_env); /* NOTE(review): confirm rpc_new_client() has not already released xprt on failure; the xprt destroy paths do put_ve + kfree, which would make this a use-after-free and double put */ return clnt; + } if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { int err = rpc_ping(clnt, RPC_TASK_SOFT); @@ -1034,6 +1070,7 @@ call_bind_status(struct rpc_task *task) if (task->tk_status >= 0) { dprint_status(task); + rpc_abort_clear(task); task->tk_status = 0; task->tk_action = call_connect; return; @@ -1060,6 +1097,10 @@ call_bind_status(struct rpc_task *task) case -ETIMEDOUT: dprintk("RPC: %5u rpcbind request timed out\n", task->tk_pid); + if (rpc_abort_hard(task)) { + status = -EIO; +
break; + } goto retry_timeout; case -EPFNOSUPPORT: /* server doesn't support any rpcbind version we know of */ @@ -1117,7 +1158,8 @@ call_connect_status(struct rpc_task *tas dprint_status(task); task->tk_status = 0; - if (status >= 0 || status == -EAGAIN) { + if (status >= 0 || + (status == -EAGAIN && !rpc_abort_hard(task))) { clnt->cl_stats->netreconn++; task->tk_action = call_transmit; return; @@ -1325,8 +1367,8 @@ call_status(struct rpc_task *task) break; default: if (clnt->cl_chatty) - printk("%s: RPC call returned error %d\n", - clnt->cl_protname, -status); + printk("ct%d %s: RPC call returned error %d\n", + get_exec_env()->veid, clnt->cl_protname, -status); rpc_exit(task, status); } } @@ -1349,10 +1391,10 @@ call_timeout(struct rpc_task *task) dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid); task->tk_timeouts++; - if (RPC_IS_SOFT(task)) { + if (RPC_IS_SOFT(task) || rpc_abort_hard(task)) { if (clnt->cl_chatty) - printk(KERN_NOTICE "%s: server %s not responding, timed out\n", - clnt->cl_protname, clnt->cl_server); + printk(KERN_NOTICE "ct%d %s: server %s not responding, timed out\n", + get_exec_env()->veid, clnt->cl_protname, clnt->cl_server); rpc_exit(task, -EIO); return; } @@ -1360,8 +1402,8 @@ call_timeout(struct rpc_task *task) if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; if (clnt->cl_chatty) - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", - clnt->cl_protname, clnt->cl_server); + printk(KERN_NOTICE "ct%d %s: server %s not responding, still trying\n", + get_exec_env()->veid, clnt->cl_protname, clnt->cl_server); } rpc_force_rebind(clnt); /* @@ -1392,11 +1434,12 @@ call_decode(struct rpc_task *task) if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) - printk(KERN_NOTICE "%s: server %s OK\n", - clnt->cl_protname, clnt->cl_server); + printk(KERN_NOTICE "ct%d %s: server %s OK\n", + get_exec_env()->veid, clnt->cl_protname, clnt->cl_server); task->tk_flags &= ~RPC_CALL_MAJORSEEN; } + 
rpc_abort_clear(task); /* * Ensure that we see all writes made by xprt_complete_rqst() * before it changed req->rq_reply_bytes_recvd. @@ -1409,7 +1452,7 @@ call_decode(struct rpc_task *task) sizeof(req->rq_rcv_buf)) != 0); if (req->rq_rcv_buf.len < 12) { - if (!RPC_IS_SOFT(task)) { + if (!RPC_IS_SOFT(task) && !rpc_abort_hard(task)) { task->tk_action = call_bind; clnt->cl_stats->rpcretrans++; goto out_retry; @@ -1757,3 +1800,54 @@ void rpc_show_tasks(void) spin_unlock(&rpc_client_lock); } #endif + +#ifdef CONFIG_VE +static int ve_sunrpc_start(void *data) +{ + return 0; +} + +void ve_sunrpc_stop(void *data) +{ + struct ve_struct *ve = (struct ve_struct *)data; + struct rpc_clnt *clnt; + + dprintk("RPC: killing all tasks for VE %d\n", ve->veid); + + spin_lock(&rpc_client_lock); + list_for_each_entry(clnt, &all_clients, cl_clients) { + if (clnt->cl_xprt->owner_env != ve) + continue; + + rpc_killall_tasks(clnt); + } + spin_unlock(&rpc_client_lock); + + flush_scheduled_work(); +} + +static struct ve_hook sunrpc_hook = { + .init = ve_sunrpc_start, + .fini = ve_sunrpc_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET_PRE, +}; + +void ve_sunrpc_hook_register(void) +{ + ve_hook_register(VE_SS_CHAIN, &sunrpc_hook); +} + +void ve_sunrpc_hook_unregister(void) +{ + ve_hook_unregister(&sunrpc_hook); +} +#else +void ve_sunrpc_hook_register(void) +{ +} + +void ve_sunrpc_hook_unregister(void) +{ +} +#endif diff -urNp linux-2.6.32.48/net/sunrpc/rpc_pipe.c linux-2.6.32.48-openvz/net/sunrpc/rpc_pipe.c --- linux-2.6.32.48/net/sunrpc/rpc_pipe.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sunrpc/rpc_pipe.c 2011-11-21 17:40:47.000000000 -0500 @@ -1028,6 +1028,7 @@ static struct file_system_type rpc_pipe_ .name = "rpc_pipefs", .get_sb = rpc_get_sb, .kill_sb = kill_litter_super, + .fs_flags = FS_VIRTUALIZED, }; static void diff -urNp linux-2.6.32.48/net/sunrpc/sched.c linux-2.6.32.48-openvz/net/sunrpc/sched.c --- linux-2.6.32.48/net/sunrpc/sched.c 2011-11-08 
19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sunrpc/sched.c 2011-11-21 17:40:47.000000000 -0500 @@ -52,6 +52,8 @@ static struct rpc_wait_queue delay_queue * rpciod-related stuff */ struct workqueue_struct *rpciod_workqueue; +DECLARE_RWSEM(rpc_async_task_lock); +EXPORT_SYMBOL(rpc_async_task_lock); /* * Disable the timer for a given RPC task. Should be called with @@ -606,7 +608,9 @@ static void __rpc_execute(struct rpc_tas struct rpc_wait_queue *queue; int task_is_async = RPC_IS_ASYNC(task); int status = 0; + struct ve_struct *env; + env = set_exec_env(task->tk_client->cl_xprt->owner_env); dprintk("RPC: %5u __rpc_execute flags=0x%x\n", task->tk_pid, task->tk_flags); @@ -655,8 +659,10 @@ static void __rpc_execute(struct rpc_tas } rpc_clear_running(task); spin_unlock_bh(&queue->lock); - if (task_is_async) + if (task_is_async) { + (void)set_exec_env(env); return; + } /* sync task: sleep here */ dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid); @@ -683,6 +689,7 @@ static void __rpc_execute(struct rpc_tas task->tk_status); /* Release all resources associated with the task */ rpc_release_task(task); + (void)set_exec_env(env); } /* @@ -703,7 +710,9 @@ void rpc_execute(struct rpc_task *task) static void rpc_async_schedule(struct work_struct *work) { + down_read(&rpc_async_task_lock); __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); + up_read(&rpc_async_task_lock); } /** @@ -940,6 +949,16 @@ void rpc_killall_tasks(struct rpc_clnt * } EXPORT_SYMBOL_GPL(rpc_killall_tasks); +void rpc_kill_client(struct rpc_clnt *clnt) +{ + if (!IS_ERR(clnt)) { + clnt->cl_broken = 1; + clnt->cl_pr_time = jiffies - xprt_abort_timeout * HZ - 1; + rpc_killall_tasks(clnt); + } +} +EXPORT_SYMBOL_GPL(rpc_kill_client); + int rpciod_up(void) { return try_module_get(THIS_MODULE) ? 
0 : -EINVAL; diff -urNp linux-2.6.32.48/net/sunrpc/sunrpc_syms.c linux-2.6.32.48-openvz/net/sunrpc/sunrpc_syms.c --- linux-2.6.32.48/net/sunrpc/sunrpc_syms.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sunrpc/sunrpc_syms.c 2011-11-21 17:40:47.000000000 -0500 @@ -24,6 +24,9 @@ extern struct cache_detail ip_map_cache, unix_gid_cache; +extern void ve_sunrpc_hook_register(void); +extern void ve_sunrpc_hook_unregister(void); + static int __init init_sunrpc(void) { @@ -46,6 +49,7 @@ init_sunrpc(void) svc_init_xprt_sock(); /* svc sock transport */ init_socket_xprt(); /* clnt sock transport */ rpcauth_init_module(); + ve_sunrpc_hook_register(); out: return err; } @@ -53,6 +57,7 @@ out: static void __exit cleanup_sunrpc(void) { + ve_sunrpc_hook_unregister(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); diff -urNp linux-2.6.32.48/net/sunrpc/svcsock.c linux-2.6.32.48-openvz/net/sunrpc/svcsock.c --- linux-2.6.32.48/net/sunrpc/svcsock.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sunrpc/svcsock.c 2011-11-21 17:40:47.000000000 -0500 @@ -229,6 +229,9 @@ static int svc_sendto(struct svc_rqst *r unsigned long tailoff; unsigned long headoff; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + struct ve_struct *old_env; + + old_env = set_exec_env(sock->sk->owner_env); if (rqstp->rq_prot == IPPROTO_UDP) { struct msghdr msg = { @@ -255,6 +258,8 @@ out: svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); + (void)set_exec_env(old_env); + return len; } @@ -1437,8 +1442,9 @@ static struct svc_xprt *svc_create_socke error = sock_create_kern(family, type, protocol, &sock); if (error < 0) - return ERR_PTR(error); + return ERR_PTR(-ENOMEM); + sk_change_net_get(sock->sk, get_exec_env()->ve_netns); svc_reclassify_socket(sock); /* @@ -1489,6 +1495,8 @@ static void svc_sock_detach(struct svc_x dprintk("svc: svc_sock_detach(%p)\n", svsk); + /* XXX: serialization? 
*/ + sk->sk_user_data = NULL; /* put back the old socket callbacks */ sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; diff -urNp linux-2.6.32.48/net/sunrpc/xprt.c linux-2.6.32.48-openvz/net/sunrpc/xprt.c --- linux-2.6.32.48/net/sunrpc/xprt.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sunrpc/xprt.c 2011-11-21 17:40:47.000000000 -0500 @@ -598,10 +598,13 @@ static void xprt_autoclose(struct work_s { struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); xprt->ops->close(xprt); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); xprt_release_write(xprt, NULL); + (void)set_exec_env(ve); } /** @@ -668,7 +671,9 @@ static void xprt_init_autodisconnect(unsigned long data) { struct rpc_xprt *xprt = (struct rpc_xprt *)data; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv) || xprt->shutdown) goto out_abort; @@ -677,9 +682,11 @@ xprt_init_autodisconnect(unsigned long d spin_unlock(&xprt->transport_lock); set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); queue_work(rpciod_workqueue, &xprt->task_cleanup); + (void)set_exec_env(ve); return; out_abort: spin_unlock(&xprt->transport_lock); + (void)set_exec_env(ve); } /** @@ -1095,6 +1102,7 @@ found: xprt->last_used = jiffies; xprt->cwnd = RPC_INITCWND; xprt->bind_index = 0; + xprt->owner_env = get_ve(get_exec_env()); rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); @@ -1146,6 +1154,7 @@ void xprt_put(struct rpc_xprt *xprt) { kref_put(&xprt->kref, xprt_destroy); } +EXPORT_SYMBOL(xprt_put); /** * xprt_get - return a reference to an RPC transport. 
@@ -1157,3 +1166,4 @@ struct rpc_xprt *xprt_get(struct rpc_xpr kref_get(&xprt->kref); return xprt; } +EXPORT_SYMBOL(xprt_get); diff -urNp linux-2.6.32.48/net/sunrpc/xprtrdma/transport.c linux-2.6.32.48-openvz/net/sunrpc/xprtrdma/transport.c --- linux-2.6.32.48/net/sunrpc/xprtrdma/transport.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sunrpc/xprtrdma/transport.c 2011-11-21 17:40:47.000000000 -0500 @@ -269,6 +269,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) kfree(xprt->slot); xprt->slot = NULL; + put_ve(xprt->owner_env); kfree(xprt); dprintk("RPC: %s: returning\n", __func__); diff -urNp linux-2.6.32.48/net/sunrpc/xprtsock.c linux-2.6.32.48-openvz/net/sunrpc/xprtsock.c --- linux-2.6.32.48/net/sunrpc/xprtsock.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/sunrpc/xprtsock.c 2011-11-21 17:40:47.000000000 -0500 @@ -72,6 +72,8 @@ static unsigned int min_slot_table_size static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; +static int xprt_min_abort_timeout = RPC_MIN_ABORT_TIMEOUT; +static int xprt_max_abort_timeout = RPC_MAX_ABORT_TIMEOUT; static struct ctl_table_header *sunrpc_table_header; @@ -125,6 +127,16 @@ static ctl_table xs_tunables_table[] = { .extra2 = &xprt_max_resvport_limit }, { + .procname = "abort_timeout", + .data = &xprt_abort_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_abort_timeout, + .extra2 = &xprt_max_abort_timeout + }, + { .procname = "tcp_fin_timeout", .data = &xs_tcp_fin_timeout, .maxlen = sizeof(xs_tcp_fin_timeout), @@ -737,18 +749,23 @@ static void xs_restore_old_callbacks(str static void xs_reset_transport(struct sock_xprt *transport) { - struct socket *sock = transport->sock; - struct sock *sk = transport->inet; + struct rpc_xprt *xprt = &transport->xprt; + 
struct socket *sock; + struct sock *sk; - if (sk == NULL) + spin_lock_bh(&xprt->transport_lock); + if (transport->sock == NULL) { + spin_unlock_bh(&xprt->transport_lock); return; - - transport->srcport = 0; - - write_lock_bh(&sk->sk_callback_lock); + } + sock = transport->sock; + sk = transport->inet; transport->inet = NULL; transport->sock = NULL; + transport->srcport = 0; + spin_unlock_bh(&xprt->transport_lock); + write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = NULL; xs_restore_old_callbacks(transport, sk); @@ -810,6 +827,7 @@ static void xs_destroy(struct rpc_xprt * xs_close(xprt); xs_free_peer_addresses(xprt); kfree(xprt->slot); + put_ve(xprt->owner_env); kfree(xprt); module_put(THIS_MODULE); } @@ -1711,7 +1729,12 @@ static void xs_udp_connect_worker4(struc struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown) goto out; @@ -1723,6 +1746,7 @@ static void xs_udp_connect_worker4(struc dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } + sk_change_net_get(sock->sk, xprt->owner_env->ve_netns); xs_reclassify_socket4(sock); if (xs_bind4(transport, sock)) { @@ -1741,6 +1765,8 @@ static void xs_udp_connect_worker4(struc out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } /** @@ -1756,7 +1782,12 @@ static void xs_udp_connect_worker6(struc struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown) goto out; @@ -1768,6 +1799,7 @@ static void xs_udp_connect_worker6(struc dprintk("RPC: can't create UDP transport socket (%d).\n", 
-err); goto out; } + sk_change_net_get(sock->sk, xprt->owner_env->ve_netns); xs_reclassify_socket6(sock); if (xs_bind6(transport, sock) < 0) { @@ -1786,6 +1818,8 @@ static void xs_udp_connect_worker6(struc out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } /* @@ -1907,7 +1941,12 @@ static void xs_tcp_setup_socket(struct r { struct socket *sock = transport->sock; int status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown) goto out; @@ -1959,6 +1998,8 @@ static void xs_tcp_setup_socket(struct r case -EINPROGRESS: case -EALREADY: xprt_clear_connecting(xprt); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); return; case -EINVAL: /* Happens, for instance, if the user specified a link @@ -1971,6 +2012,8 @@ out_eagain: out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt, @@ -1986,6 +2029,7 @@ static struct socket *xs_create_tcp_sock -err); goto out_err; } + sk_change_net_get(sock->sk, xprt->owner_env->ve_netns); xs_reclassify_socket4(sock); if (xs_bind4(transport, sock) < 0) { @@ -2025,6 +2069,7 @@ static struct socket *xs_create_tcp_sock -err); goto out_err; } + sk_change_net_get(sock->sk, xprt->owner_env->ve_netns); xs_reclassify_socket6(sock); if (xs_bind6(transport, sock) < 0) { diff -urNp linux-2.6.32.48/net/unix/af_unix.c linux-2.6.32.48-openvz/net/unix/af_unix.c --- linux-2.6.32.48/net/unix/af_unix.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/unix/af_unix.c 2011-11-21 17:40:47.000000000 -0500 @@ -115,6 +115,9 @@ #include #include +#include +#include + static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); static atomic_t 
unix_nr_socks = ATOMIC_INIT(0); @@ -292,9 +295,6 @@ static struct sock *unix_find_socket_byi &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; - if (!net_eq(sock_net(s), net)) - continue; - if (dentry && dentry->d_inode == i) { sock_hold(s); goto found; @@ -595,6 +595,8 @@ static struct sock *unix_create1(struct sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); if (!sk) goto out; + if (ub_other_sock_charge(sk)) + goto out_sk_free; sock_init_data(sock, sk); lockdep_set_class(&sk->sk_receive_queue.lock, @@ -621,6 +623,10 @@ out: local_bh_enable(); } return sk; +out_sk_free: + sk_free(sk); + atomic_dec(&unix_nr_socks); + return NULL; } static int unix_create(struct net *net, struct socket *sock, int protocol) @@ -1037,6 +1043,7 @@ static int unix_stream_connect(struct so int st; int err; long timeo; + unsigned long chargesize; err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) @@ -1065,6 +1072,10 @@ static int unix_stream_connect(struct so skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (skb == NULL) goto out; + chargesize = skb_charge_fullsize(skb); + if (ub_sock_getwres_other(newsk, chargesize) < 0) + goto out; + ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF); restart: /* Find listening sock. 
*/ @@ -1313,7 +1324,7 @@ static void unix_detach_fds(struct scm_c unix_notinflight(scm->fp->fp[i]); } -static void unix_destruct_fds(struct sk_buff *skb) +void unix_destruct_fds(struct sk_buff *skb) { struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); @@ -1324,6 +1335,7 @@ static void unix_destruct_fds(struct sk_ scm_destroy(&scm); sock_wfree(skb); } +EXPORT_SYMBOL_GPL(unix_destruct_fds); #define MAX_RECURSION_LEVEL 4 @@ -1572,6 +1584,16 @@ static int unix_stream_sendmsg(struct ki size = len-sent; + if (msg->msg_flags & MSG_DONTWAIT) + ub_sock_makewres_other(sk, skb_charge_size(size)); + if (sock_bc(sk) != NULL && + sock_bc(sk)->poll_reserv >= + SOCK_MIN_UBCSPACE && + skb_charge_size(size) > + sock_bc(sk)->poll_reserv) + size = skb_charge_datalen(sock_bc(sk)->poll_reserv); + + /* Keep two messages in the pipe so it schedules better */ if (size > ((sk->sk_sndbuf >> 1) - 64)) size = (sk->sk_sndbuf >> 1) - 64; @@ -1583,8 +1605,9 @@ static int unix_stream_sendmsg(struct ki * Grab a buffer */ - skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, - &err); + + skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE, + msg->msg_flags&MSG_DONTWAIT, &err); if (skb == NULL) goto out_err; @@ -2039,6 +2062,7 @@ static unsigned int unix_poll(struct fil { struct sock *sk = sock->sk; unsigned int mask; + int no_ub_res; sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; @@ -2051,6 +2075,10 @@ static unsigned int unix_poll(struct fil if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP; + no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); + if (no_ub_res) + ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); + /* readable? */ if (!skb_queue_empty(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) @@ -2065,7 +2093,7 @@ static unsigned int unix_poll(struct fil * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. 
*/ - if (unix_writable(sk)) + if (!no_ub_res && unix_writable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; return mask; diff -urNp linux-2.6.32.48/net/unix/garbage.c linux-2.6.32.48-openvz/net/unix/garbage.c --- linux-2.6.32.48/net/unix/garbage.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/unix/garbage.c 2011-11-21 17:40:47.000000000 -0500 @@ -81,6 +81,7 @@ #include #include #include +#include #include #include @@ -153,6 +154,7 @@ void unix_notinflight(struct file *fp) spin_unlock(&unix_gc_lock); } } +EXPORT_SYMBOL_GPL(unix_notinflight); static inline struct sk_buff *sock_queue_head(struct sock *sk) { diff -urNp linux-2.6.32.48/net/xfrm/xfrm_user.c linux-2.6.32.48-openvz/net/xfrm/xfrm_user.c --- linux-2.6.32.48/net/xfrm/xfrm_user.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/net/xfrm/xfrm_user.c 2011-11-21 17:40:47.000000000 -0500 @@ -2005,7 +2005,7 @@ static int xfrm_user_rcv_msg(struct sk_b link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || diff -urNp linux-2.6.32.48/scripts/kconfig/conf.c linux-2.6.32.48-openvz/scripts/kconfig/conf.c --- linux-2.6.32.48/scripts/kconfig/conf.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/scripts/kconfig/conf.c 2011-11-21 17:40:47.000000000 -0500 @@ -31,6 +31,7 @@ enum { } input_mode = ask_all; char *defconfig_file; +static int dont_ask = -1; static int indent = 1; static int valid_stdin = 1; static int sync_kconfig; @@ -99,6 +100,10 @@ static int conf_askvalue(struct symbol * printf("%s\n", def); return 0; } + if (dont_ask >= 0) { + dont_ask++; + break; + } check_stdin(); case ask_all: fflush(stdout); @@ -301,6 +306,10 @@ static int conf_choice(struct menu *menu printf("%d\n", cnt); break; } + if (dont_ask >= 0) { + dont_ask++; + break; + } check_stdin(); case ask_all: 
fflush(stdout); @@ -439,8 +448,10 @@ int main(int ac, char **av) bindtextdomain(PACKAGE, LOCALEDIR); textdomain(PACKAGE); - while ((opt = getopt(ac, av, "osdD:nmyrh")) != -1) { + while ((opt = getopt(ac, av, "obsdD:nmyrh")) != -1) { switch (opt) { + case 'b': + dont_ask = 0; case 'o': input_mode = ask_silent; break; @@ -613,5 +624,5 @@ int main(int ac, char **av) exit(1); } } - return 0; + return (dont_ask > 0) ? 1 : 0; } diff -urNp linux-2.6.32.48/scripts/kconfig/Makefile linux-2.6.32.48-openvz/scripts/kconfig/Makefile --- linux-2.6.32.48/scripts/kconfig/Makefile 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/scripts/kconfig/Makefile 2011-11-21 17:40:47.000000000 -0500 @@ -3,7 +3,7 @@ # These targets are used from top-level makefile PHONY += oldconfig xconfig gconfig menuconfig config silentoldconfig update-po-config \ - localmodconfig localyesconfig + localmodconfig localyesconfig nonint_oldconfig ifdef KBUILD_KCONFIG Kconfig := $(KBUILD_KCONFIG) @@ -58,6 +58,9 @@ localyesconfig: $(obj)/streamline_config fi $(Q)rm -f .tmp.config +nonint_oldconfig: $(obj)/conf + $< -b $(Kconfig) + # Create new linux.pot file # Adjust charset to UTF-8 in .po file to accept UTF-8 in Kconfig files # The symlink is used to repair a deficiency in arch/um diff -urNp linux-2.6.32.48/security/commoncap.c linux-2.6.32.48-openvz/security/commoncap.c --- linux-2.6.32.48/security/commoncap.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/security/commoncap.c 2011-11-21 17:40:47.000000000 -0500 @@ -58,6 +58,10 @@ int cap_netlink_send(struct sock *sk, st int cap_netlink_recv(struct sk_buff *skb, int cap) { + if (likely(cap == CAP_VE_NET_ADMIN) && + cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) + return 0; + if (!cap_raised(NETLINK_CB(skb).eff_cap, cap)) return -EPERM; return 0; @@ -618,7 +622,7 @@ int cap_inode_setxattr(struct dentry *de if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + 
!capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) return -EPERM; return 0; } @@ -644,7 +648,7 @@ int cap_inode_removexattr(struct dentry if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) return -EPERM; return 0; } @@ -962,8 +966,9 @@ error: */ int cap_syslog(int type) { - if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if ((type != 3 && type != 10) && + !capable(CAP_VE_SYS_ADMIN) && !capable(CAP_SYS_ADMIN)) + return -EPERM; return 0; } diff -urNp linux-2.6.32.48/security/device_cgroup.c linux-2.6.32.48-openvz/security/device_cgroup.c --- linux-2.6.32.48/security/device_cgroup.c 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/security/device_cgroup.c 2011-11-21 17:40:47.000000000 -0500 @@ -12,11 +12,23 @@ #include #include #include +#include +#include +#include #define ACC_MKNOD 1 #define ACC_READ 2 #define ACC_WRITE 4 -#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE) +#define ACC_QUOTA 8 +#define ACC_HIDDEN 16 +#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_QUOTA) + +static inline int convert_bits(int acc) +{ + /* ...10x <-> ...01x trial: guess why */ + return ((((acc & 06) == 00) || ((acc & 06) == 06)) ? 
acc : acc ^06) & + (ACC_READ | ACC_WRITE | ACC_QUOTA); +} #define DEV_BLOCK 1 #define DEV_CHAR 2 @@ -73,6 +85,38 @@ static int devcgroup_can_attach(struct c /* * called under devcgroup_mutex */ +#ifdef CONFIG_VE +static struct dev_whitelist_item default_whitelist_items[] = { + { ~0, ~0, DEV_ALL, ACC_MKNOD }, + { UNIX98_PTY_MASTER_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { UNIX98_PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { PTY_MASTER_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { MEM_MAJOR, /* null */ 3, DEV_CHAR, ACC_READ | ACC_WRITE }, + { MEM_MAJOR, /* zero */ 5, DEV_CHAR, ACC_READ | ACC_WRITE }, + { MEM_MAJOR, /* full */ 7, DEV_CHAR, ACC_READ | ACC_WRITE }, + { TTYAUX_MAJOR, /* tty */ 0, DEV_CHAR, ACC_READ | ACC_WRITE }, + { TTYAUX_MAJOR, /* ptmx */ 2, DEV_CHAR, ACC_READ | ACC_WRITE }, + { MEM_MAJOR, /* random */ 8, DEV_CHAR, ACC_READ }, + { MEM_MAJOR, /* urandom */ 9, DEV_CHAR, ACC_READ }, +}; + +static LIST_HEAD(default_perms); +#define parent_whitelist(p) (&default_perms) +static void prepare_def_perms(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(default_whitelist_items); i++) { + default_whitelist_items[i].access |= ACC_HIDDEN; + list_add(&default_whitelist_items[i].list, &default_perms); + } +} +#else +#define prepare_def_perms() do { } while(0) +#define parent_whitelist(p) (&parent_dev_cgroup->whitelist) +#endif + static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig) { struct dev_whitelist_item *wh, *tmp, *new; @@ -187,11 +231,13 @@ static struct cgroup_subsys_state *devcg wh->type = DEV_ALL; wh->access = ACC_MASK; list_add(&wh->list, &dev_cgroup->whitelist); + + prepare_def_perms(); } else { parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup); mutex_lock(&devcgroup_mutex); ret = dev_whitelist_copy(&dev_cgroup->whitelist, - &parent_dev_cgroup->whitelist); + parent_whitelist(parent_dev_cgroup)); mutex_unlock(&devcgroup_mutex); if (ret) { 
kfree(dev_cgroup); @@ -266,8 +312,15 @@ static int devcgroup_seq_read(struct cgr set_access(acc, wh->access); set_majmin(maj, wh->major); set_majmin(min, wh->minor); - seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type), - maj, min, acc); + + if (cft != NULL) + seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type), + maj, min, acc); + else if (!(wh->access & ACC_HIDDEN)) + seq_printf(m, "%10u %c %03o %s:%s\n", + (unsigned)(unsigned long)m->private, + type_to_char(wh->type), + convert_bits(wh->access), maj, min); } rcu_read_unlock(); @@ -474,38 +527,35 @@ struct cgroup_subsys devices_subsys = { .subsys_id = devices_subsys_id, }; -int devcgroup_inode_permission(struct inode *inode, int mask) +static int __devcgroup_inode_permission(int blk, dev_t device, int mask) { struct dev_cgroup *dev_cgroup; struct dev_whitelist_item *wh; - dev_t device = inode->i_rdev; if (!device) return 0; - if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)) - return 0; rcu_read_lock(); - dev_cgroup = task_devcgroup(current); list_for_each_entry_rcu(wh, &dev_cgroup->whitelist, list) { if (wh->type & DEV_ALL) goto found; - if ((wh->type & DEV_BLOCK) && !S_ISBLK(inode->i_mode)) + if ((wh->type & DEV_BLOCK) && !blk) continue; - if ((wh->type & DEV_CHAR) && !S_ISCHR(inode->i_mode)) + if ((wh->type & DEV_CHAR) && blk) continue; - if (wh->major != ~0 && wh->major != imajor(inode)) + if (wh->major != ~0 && wh->major != MAJOR(device)) continue; - if (wh->minor != ~0 && wh->minor != iminor(inode)) + if (wh->minor != ~0 && wh->minor != MINOR(device)) continue; - +found: if ((mask & MAY_WRITE) && !(wh->access & ACC_WRITE)) continue; if ((mask & MAY_READ) && !(wh->access & ACC_READ)) continue; -found: + if ((mask & MAY_QUOTACTL) && !(wh->access & ACC_QUOTA)) + continue; rcu_read_unlock(); return 0; } @@ -515,6 +565,15 @@ found: return -EPERM; } +int devcgroup_inode_permission(struct inode *inode, int mask) +{ + if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)) + return 0; + + return 
__devcgroup_inode_permission(S_ISBLK(inode->i_mode), + inode->i_rdev, mask); +} + int devcgroup_inode_mknod(int mode, dev_t dev) { struct dev_cgroup *dev_cgroup; @@ -538,10 +597,9 @@ int devcgroup_inode_mknod(int mode, dev_ continue; if (wh->minor != ~0 && wh->minor != MINOR(dev)) continue; - +found: if (!(wh->access & ACC_MKNOD)) continue; -found: rcu_read_unlock(); return 0; } @@ -550,3 +608,75 @@ found: return -EPERM; } + +#ifdef CONFIG_VE +int get_device_perms_ve(int dev_type, dev_t dev, int access_mode) +{ + int mask = 0; + + mask |= (access_mode & FMODE_READ ? MAY_READ : 0); + mask |= (access_mode & FMODE_WRITE ? MAY_WRITE : 0); + mask |= (access_mode & FMODE_QUOTACTL ? MAY_QUOTACTL : 0); + + return __devcgroup_inode_permission(dev_type == S_IFBLK, dev, mask); +} +EXPORT_SYMBOL(get_device_perms_ve); + +int set_device_perms_ve(struct ve_struct *ve, + unsigned type, dev_t dev, unsigned mask) +{ + int err = -EINVAL; + struct dev_whitelist_item *new; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return -ENOMEM; + + if ((type & S_IFMT) == S_IFBLK) + new->type = DEV_BLOCK; + else if ((type & S_IFMT) == S_IFCHR) + new->type = DEV_CHAR; + else + goto out; + + new->access = convert_bits(mask); + new->major = new->minor = ~0; + + switch (type & VE_USE_MASK) { + default: + new->minor = MINOR(dev); + case VE_USE_MAJOR: + new->major = MAJOR(dev); + case 0: + ; + } + + err = dev_whitelist_add(cgroup_to_devcgroup(ve->ve_cgroup), new); +out: + if (err < 0) + kfree(new); + return err; +} +EXPORT_SYMBOL(set_device_perms_ve); + +#ifdef CONFIG_PROC_FS +int devperms_seq_show(struct seq_file *m, void *v) +{ + struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list); + + if (m->private == (void *)0) { + seq_printf(m, "Version: 2.7\n"); + m->private = (void *)-1; + } + + if (ve_is_super(ve)) { + seq_printf(m, "%10u b 016 *:*\n%10u c 006 *:*\n", 0, 0); + return 0; + } + + m->private = (void *)(unsigned long)ve->veid; + return 
devcgroup_seq_read(ve->ve_cgroup, NULL, m); +} +EXPORT_SYMBOL(devperms_seq_show); +#endif +#endif diff -urNp linux-2.6.32.48/security/Kconfig linux-2.6.32.48-openvz/security/Kconfig --- linux-2.6.32.48/security/Kconfig 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/security/Kconfig 2011-11-21 17:40:47.000000000 -0500 @@ -41,7 +41,7 @@ config KEYS_DEBUG_PROC_KEYS config SECURITY bool "Enable different security models" - depends on SYSFS + depends on SYSFS && !VE help This allows you to choose different security modules to be configured into your kernel. diff -urNp linux-2.6.32.48/security/selinux/Kconfig linux-2.6.32.48-openvz/security/selinux/Kconfig --- linux-2.6.32.48/security/selinux/Kconfig 2011-11-08 19:02:43.000000000 -0500 +++ linux-2.6.32.48-openvz/security/selinux/Kconfig 2011-11-21 17:40:47.000000000 -0500 @@ -1,6 +1,6 @@ config SECURITY_SELINUX bool "NSA SELinux Support" - depends on SECURITY_NETWORK && AUDIT && NET && INET + depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE select NETWORK_SECMARK default n help