diff -NurpX nopatch linux-2.4.36.6/Documentation/Configure.help linux-2.4.36.6-pax/Documentation/Configure.help --- linux-2.4.36.6/Documentation/Configure.help 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/Documentation/Configure.help 2008-05-08 12:41:32.000000000 +0200 @@ -29113,6 +29113,327 @@ CONFIG_SOUND_WM97XX If unsure, say N. +Support soft mode +CONFIG_PAX_SOFTMODE + Enabling this option will allow you to run PaX in soft mode, that + is, PaX features will not be enforced by default, only on executables + marked explicitly. You must also enable PT_PAX_FLAGS support as it + is the only way to mark executables for soft mode use. + + Soft mode can be activated by using the "pax_softmode=1" kernel command + line option on boot. Furthermore you can control various PaX features + at runtime via the entries in /proc/sys/kernel/pax. + +Use legacy ELF header marking +CONFIG_PAX_EI_PAX + Enabling this option will allow you to control PaX features on + a per executable basis via the 'chpax' utility available at + http://pax.grsecurity.net/. The control flags will be read from + an otherwise reserved part of the ELF header. This marking has + numerous drawbacks (no support for soft-mode, toolchain does not + know about the non-standard use of the ELF header) therefore it + has been deprecated in favour of PT_PAX_FLAGS support. + + If you have applications not marked by the PT_PAX_FLAGS ELF + program header then you MUST enable this option otherwise they + will not get any protection. + + Note that if you enable PT_PAX_FLAGS marking support as well, + the PT_PAX_FLAGS marks will override the legacy EI_PAX marks. + +Use ELF program header marking +CONFIG_PAX_PT_PAX_FLAGS + Enabling this option will allow you to control PaX features on + a per executable basis via the 'paxctl' utility available at + http://pax.grsecurity.net/. The control flags will be read from + a PaX specific ELF program header (PT_PAX_FLAGS).
This marking + has the benefits of supporting both soft mode and being fully + integrated into the toolchain (the binutils patch is available + from http://pax.grsecurity.net). + + If you have applications not marked by the PT_PAX_FLAGS ELF + program header then you MUST enable the EI_PAX marking support + otherwise they will not get any protection. + + Note that if you enable the legacy EI_PAX marking support as well, + the EI_PAX marks will be overridden by the PT_PAX_FLAGS marks. + +MAC system integration +CONFIG_PAX_NO_ACL_FLAGS + Mandatory Access Control systems have the option of controlling + PaX flags on a per executable basis, choose the method supported + by your particular system. + + - "none": if your MAC system does not interact with PaX, + - "direct": if your MAC system defines pax_set_initial_flags() itself, + - "hook": if your MAC system uses the pax_set_initial_flags_func callback. + + NOTE: this option is for developers/integrators only. + +Enforce non-executable pages +CONFIG_PAX_NOEXEC + By design some architectures do not allow for protecting memory + pages against execution or even if they do, Linux does not make + use of this feature. In practice this means that if a page is + readable (such as the stack or heap) it is also executable. + + There is a well known exploit technique that makes use of this + fact and a common programming mistake where an attacker can + introduce code of his choice somewhere in the attacked program's + memory (typically the stack or the heap) and then execute it. + + If the attacked program was running with different (typically + higher) privileges than that of the attacker, then he can elevate + his own privilege level (e.g. get a root shell, write to files to + which he does not have write access, etc). + + Enabling this option will let you choose from various features + that prevent the injection and execution of 'foreign' code in + a program.
+ + This will also break programs that rely on the old behaviour and + expect that dynamically allocated memory via the malloc() family + of functions is executable (which it is not). Notable examples + are the XFree86 4.x server, the java runtime and wine. + +Paging based non-executable pages +CONFIG_PAX_PAGEEXEC + This implementation is based on the paging feature of the CPU. + On i386 it has a variable performance impact on applications + depending on their memory usage pattern. You should carefully + test your applications before using this feature in production. + On alpha, ia64, parisc, sparc, sparc64 and x86_64 there is no + performance impact. On ppc there is a slight performance impact. + +Segmentation based non-executable pages +CONFIG_PAX_SEGMEXEC + This implementation is based on the segmentation feature of the + CPU and has little performance impact, however applications will + be limited to a 1.5 GB address space instead of the normal 3 GB. + +Emulate trampolines +CONFIG_PAX_EMUTRAMP + There are some programs and libraries that for one reason or + another attempt to execute special small code snippets from + non-executable memory pages. Most notable examples are the + signal handler return code generated by the kernel itself and + the GCC trampolines. + + If you enabled CONFIG_PAX_PAGEEXEC or CONFIG_PAX_SEGMEXEC then + such programs will no longer work under your kernel. + + As a remedy you can say Y here and use the 'chpax' or 'paxctl' + utilities to enable trampoline emulation for the affected programs + yet still have the protection provided by the non-executable pages. + + On parisc and ppc you MUST enable this option and EMUSIGRT as + well, otherwise your system will not even boot. + + Alternatively you can say N here and use the 'chpax' or 'paxctl' + utilities to disable CONFIG_PAX_PAGEEXEC and CONFIG_PAX_SEGMEXEC + for the affected files. 
+ + NOTE: enabling this feature *may* open up a loophole in the + protection provided by non-executable pages that an attacker + could abuse. Therefore the best solution is to not have any + files on your system that would require this option. This can + be achieved by not using libc5 (which relies on the kernel + signal handler return code) and not using or rewriting programs + that make use of the nested function implementation of GCC. + Skilled users can just fix GCC itself so that it implements + nested function calls in a way that does not interfere with PaX. + +Automatically emulate sigreturn trampolines +CONFIG_PAX_EMUSIGRT + Enabling this option will have the kernel automatically detect + and emulate signal return trampolines executing on the stack + that would otherwise lead to task termination. + + This solution is intended as a temporary one for users with + legacy versions of libc (libc5, glibc 2.0, uClibc before 0.9.17, + Modula-3 runtime, etc) or executables linked to such, basically + everything that does not specify its own SA_RESTORER function in + normal executable memory like glibc 2.1+ does. + + On parisc and ppc you MUST enable this option, otherwise your + system will not even boot. + + NOTE: this feature cannot be disabled on a per executable basis + and since it *does* open up a loophole in the protection provided + by non-executable pages, the best solution is to not have any + files on your system that would require this option. + +Restrict mprotect() +CONFIG_PAX_MPROTECT + Enabling this option will prevent programs from + - changing the executable status of memory pages that were + not originally created as executable, + - making read-only executable pages writable again, + - creating executable pages from anonymous memory. + + You should say Y here to complete the protection provided by + the enforcement of non-executable pages. + + NOTE: you can use the 'chpax' or 'paxctl' utilities to control + this feature on a per file basis. 
+ +Disallow ELF text relocations +CONFIG_PAX_NOELFRELOCS + Non-executable pages and mprotect() restrictions are effective + in preventing the introduction of new executable code into an + attacked task's address space. There remain only two avenues + for this kind of attack: if the attacker can execute already + existing code in the attacked task then he can either have it + create and mmap() a file containing his code or have it mmap() + an already existing ELF library that does not have position + independent code in it and use mprotect() on it to make it + writable and copy his code there. While protecting against + the former approach is beyond PaX, the latter can be prevented + by having only PIC ELF libraries on one's system (which do not + need to relocate their code). If you are sure this is your case, + then enable this option otherwise be careful as you may not even + be able to boot or log on your system (for example, some PAM + modules are erroneously compiled as non-PIC by default). + + NOTE: if you are using dynamic ELF executables (as suggested + when using ASLR) then you must have made sure that you linked + your files using the PIC version of crt1 (the et_dyn.tar.gz package + referenced there has already been updated to support this). + +Allow ELF ET_EXEC text relocations +CONFIG_PAX_ETEXECRELOCS + On some architectures there are incorrectly created applications + that require text relocations and would not work without enabling + this option. If you are an alpha, ia64 or parisc user, you should + enable this option and disable it once you have made sure that + none of your applications need it. + +Automatically emulate ELF PLT +CONFIG_PAX_EMUPLT + Enabling this option will have the kernel automatically detect + and emulate the Procedure Linkage Table entries in ELF files. + On some architectures such entries are in writable memory, and + become non-executable leading to task termination.
Therefore + it is mandatory that you enable this option on alpha, parisc, ppc, + sparc and sparc64, otherwise your system would not even boot. + + NOTE: this feature *does* open up a loophole in the protection + provided by the non-executable pages, therefore the proper + solution is to modify the toolchain to produce a PLT that does + not need to be writable. + +Enforce non-executable kernel pages +CONFIG_PAX_KERNEXEC + This is the kernel land equivalent of PAGEEXEC and MPROTECT, + that is, enabling this option will make it harder to inject + and execute 'foreign' code in kernel memory itself. + +Address Space Layout Randomization +CONFIG_PAX_ASLR + Many if not most exploit techniques rely on the knowledge of + certain addresses in the attacked program. The following options + will allow the kernel to apply a certain amount of randomization + to specific parts of the program thereby forcing an attacker to + guess them in most cases. Any failed guess will most likely crash + the attacked program which allows the kernel to detect such attempts + and react to them. PaX itself provides no reaction mechanisms, + instead it is strongly encouraged that you make use of Nergal's + segvguard (ftp://ftp.pl.openwall.com/misc/segvguard/) or grsecurity's + (http://www.grsecurity.net/) built-in crash detection features or + develop one yourself. + + By saying Y here you can choose to randomize the following areas: + - top of the task's kernel stack + - top of the task's userland stack + - base address for mmap() requests that do not specify one + (this includes all libraries) + - base address of the main executable + + It is strongly recommended to say Y here as address space layout + randomization has negligible impact on performance yet it provides + a very effective protection. + + NOTE: you can use the 'chpax' or 'paxctl' utilities to control most + of these features on a per file basis.
+ +Randomize kernel stack base +CONFIG_PAX_RANDKSTACK + By saying Y here the kernel will randomize every task's kernel + stack on every system call. This will not only force an attacker + to guess it but also prevent him from making use of possible + leaked information about it. + + Since the kernel stack is a rather scarce resource, randomization + may cause unexpected stack overflows, therefore you should very + carefully test your system. Note that once enabled in the kernel + configuration, this feature cannot be disabled on a per file basis. + +Randomize user stack base +CONFIG_PAX_RANDUSTACK + By saying Y here the kernel will randomize every task's userland + stack. The randomization is done in two steps where the second + one may apply a big amount of shift to the top of the stack and + cause problems for programs that want to use lots of memory (more + than 2.5 GB if SEGMEXEC is not active, or 1.25 GB when it is). + For this reason the second step can be controlled by 'chpax' or + 'paxctl' on a per file basis. + +Randomize mmap() base +CONFIG_PAX_RANDMMAP + By saying Y here the kernel will use a randomized base address for + mmap() requests that do not specify one themselves. As a result + all dynamically loaded libraries will appear at random addresses + and therefore be harder to exploit by a technique where an attacker + attempts to execute library code for his purposes (e.g. spawn a + shell from an exploited program that is running at an elevated + privilege level). + + Furthermore, if a program is relinked as a dynamic ELF file, its + base address will be randomized as well, completing the full + randomization of the address space layout. Attacking such programs + becomes a guessing game. You can find an example of doing this at + http://pax.grsecurity.net/et_dyn.tar.gz and practical samples + at http://www.grsecurity.net/grsec-gcc-specs.tar.gz . + + NOTE: you can use the 'chpax' or 'paxctl' utilities to control this + feature on a per file basis.
+ +Sanitize all freed memory +CONFIG_PAX_MEMORY_SANITIZE + By saying Y here the kernel will erase memory pages as soon as they + are freed. This in turn reduces the lifetime of data stored in the + pages, making it less likely that sensitive information such as + passwords, cryptographic secrets, etc stay in memory for too long. + + This is especially useful for programs whose runtime is short; long + lived processes and the kernel itself benefit from this as long as + they operate on whole memory pages and ensure timely freeing of pages + that may hold sensitive information. + + The tradeoff is performance impact: on a single CPU system kernel + compilation sees a 3% slowdown; other systems and workloads may vary + and you are advised to test this feature on your expected workload + before deploying it. + + Note that this feature does not protect data stored in live pages, + e.g., process memory swapped to disk may stay there for a long time. + +Prevent invalid userland pointer dereference +CONFIG_PAX_MEMORY_UDEREF + By saying Y here the kernel will be prevented from dereferencing + userland pointers in contexts where the kernel expects only kernel + pointers. This is both a useful runtime debugging feature and a + security measure that prevents exploiting a class of kernel bugs. + + The tradeoff is that some virtualization solutions may experience + a huge slowdown and therefore you should not enable this feature + for kernels meant to run in such environments. Whether a given VM + solution is affected or not is best determined by simply trying it + out, the performance impact will be obvious right on boot as this + mechanism engages from very early on. A good rule of thumb is that + VMs running on CPUs without hardware virtualization support (i.e., + the majority of IA-32 CPUs) will likely experience the slowdown.
+ # # A couple of things I keep forgetting: # capitalize: AppleTalk, Ethernet, DOS, DMA, FAT, FTP, Internet, diff -NurpX nopatch linux-2.4.36.6/Makefile linux-2.4.36.6-pax/Makefile --- linux-2.4.36.6/Makefile 2008-06-06 22:21:41.000000000 +0200 +++ linux-2.4.36.6-pax/Makefile 2008-06-06 22:21:58.000000000 +0200 @@ -382,6 +382,11 @@ init/do_mounts.o: init/do_mounts.c inclu fs lib mm ipc kernel drivers net: dummy $(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" $(subst $@, _dir_$@, $@) +cscope: + find include -type d \( -name "asm-*" -o -name config \) -prune -o -name '*.h' -print > cscope.files + find kernel drivers mm fs net ipc lib crypto init arch/${ARCH} include/asm-$(ARCH) include/asm-generic -name '*.[chS]' >> cscope.files + cscope -k -b -q < cscope.files + TAGS: dummy { find include/asm-${ARCH} -name '*.h' -print ; \ find include -type d \( -name "asm-*" -o -name config \) -prune -o -name '*.h' -print ; \ diff -NurpX nopatch linux-2.4.36.6/arch/alpha/config.in linux-2.4.36.6-pax/arch/alpha/config.in --- linux-2.4.36.6/arch/alpha/config.in 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/alpha/config.in 2008-05-08 12:41:32.000000000 +0200 @@ -466,5 +466,65 @@ int 'Kernel messages buffer length shift endmenu +mainmenu_option next_comment +comment 'PaX options' + +mainmenu_option next_comment +comment 'PaX Control' +bool 'Support soft mode' CONFIG_PAX_SOFTMODE +bool 'Use legacy ELF header marking' CONFIG_PAX_EI_PAX +bool 'Use ELF program header marking' CONFIG_PAX_PT_PAX_FLAGS +choice 'MAC system integration' \ + "none CONFIG_PAX_NO_ACL_FLAGS \ + direct CONFIG_PAX_HAVE_ACL_FLAGS \ + hook CONFIG_PAX_HOOK_ACL_FLAGS" none +endmenu + +mainmenu_option next_comment +comment 'Non-executable pages' +if [ "$CONFIG_PAX_EI_PAX" = "y" -o \ + "$CONFIG_PAX_PT_PAX_FLAGS" = "y" -o \ + "$CONFIG_PAX_HAVE_ACL_FLAGS" = "y" -o \ + "$CONFIG_PAX_HOOK_ACL_FLAGS" = "y" ]; then + bool 'Enforce non-executable pages' CONFIG_PAX_NOEXEC + if [ "$CONFIG_PAX_NOEXEC" = "y" ]; then + 
bool 'Paging based non-executable pages' CONFIG_PAX_PAGEEXEC + if [ "$CONFIG_PAX_PAGEEXEC" = "y" ]; then +# bool ' Emulate trampolines' CONFIG_PAX_EMUTRAMP +# if [ "$CONFIG_PAX_EMUTRAMP" = "y" ]; then +# bool ' Automatically emulate sigreturn trampolines' CONFIG_PAX_EMUSIGRT +# fi + bool ' Restrict mprotect()' CONFIG_PAX_MPROTECT + if [ "$CONFIG_PAX_MPROTECT" = "y" ]; then +# bool ' Disallow ELF text relocations' CONFIG_PAX_NOELFRELOCS + bool ' Automatically emulate ELF PLT' CONFIG_PAX_EMUPLT + bool ' Allow ELF ET_EXEC text relocations' CONFIG_PAX_ETEXECRELOCS + fi + fi + fi +fi +endmenu + +mainmenu_option next_comment +comment 'Address Space Layout Randomization' +if [ "$CONFIG_PAX_EI_PAX" = "y" -o \ + "$CONFIG_PAX_PT_PAX_FLAGS" = "y" -o \ + "$CONFIG_PAX_HAVE_ACL_FLAGS" = "y" -o \ + "$CONFIG_PAX_HOOK_ACL_FLAGS" = "y" ]; then + bool 'Address Space Layout Randomization' CONFIG_PAX_ASLR + if [ "$CONFIG_PAX_ASLR" = "y" ]; then + bool ' Randomize user stack base' CONFIG_PAX_RANDUSTACK + bool ' Randomize mmap() base' CONFIG_PAX_RANDMMAP + fi +fi +endmenu + +mainmenu_option next_comment +comment 'Miscellaneous hardening features' +bool 'Sanitize all freed memory' CONFIG_PAX_MEMORY_SANITIZE +endmenu + +endmenu + source crypto/Config.in source lib/Config.in diff -NurpX nopatch linux-2.4.36.6/arch/alpha/kernel/osf_sys.c linux-2.4.36.6-pax/arch/alpha/kernel/osf_sys.c --- linux-2.4.36.6/arch/alpha/kernel/osf_sys.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/alpha/kernel/osf_sys.c 2008-05-08 12:41:32.000000000 +0200 @@ -1357,6 +1357,10 @@ arch_get_unmapped_area(struct file *filp merely specific addresses, but regions of memory -- perhaps this feature should be incorporated into all ports? 
*/ +#ifdef CONFIG_PAX_RANDMMAP + if (!(current->mm->pax_flags & MF_PAX_RANDMMAP) || !filp) +#endif + if (addr) { addr = arch_get_unmapped_area_1 (PAGE_ALIGN(addr), len, limit); if (addr != -ENOMEM) @@ -1364,8 +1368,15 @@ arch_get_unmapped_area(struct file *filp } /* Next, try allocating at TASK_UNMAPPED_BASE. */ - addr = arch_get_unmapped_area_1 (PAGE_ALIGN(TASK_UNMAPPED_BASE), - len, limit); + + addr = TASK_UNMAPPED_BASE; + +#ifdef CONFIG_PAX_RANDMMAP + if (current->mm->pax_flags & MF_PAX_RANDMMAP) + addr += current->mm->delta_mmap; +#endif + + addr = arch_get_unmapped_area_1 (PAGE_ALIGN(addr), len, limit); if (addr != -ENOMEM) return addr; diff -NurpX nopatch linux-2.4.36.6/arch/alpha/mm/fault.c linux-2.4.36.6-pax/arch/alpha/mm/fault.c --- linux-2.4.36.6/arch/alpha/mm/fault.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/alpha/mm/fault.c 2008-05-08 12:41:32.000000000 +0200 @@ -53,6 +53,123 @@ __load_new_mm_context(struct mm_struct * __reload_thread(&current->thread); } +#ifdef CONFIG_PAX_PAGEEXEC +/* + * PaX: decide what to do with offenders (regs->pc = fault address) + * + * returns 1 when task should be killed + * 2 when patched PLT trampoline was detected + * 3 when unpatched PLT trampoline was detected + */ +static int pax_handle_fetch_fault(struct pt_regs *regs) +{ + int err; + +#ifdef CONFIG_PAX_EMUPLT + do { /* PaX: patched PLT emulation #1 */ + unsigned int ldah, ldq, jmp; + + err = get_user(ldah, (unsigned int *)regs->pc); + err |= get_user(ldq, (unsigned int *)(regs->pc+4)); + err |= get_user(jmp, (unsigned int *)(regs->pc+8)); + + if (err) + break; + + if ((ldah & 0xFFFF0000U) == 0x277B0000U && + (ldq & 0xFFFF0000U) == 0xA77B0000U && + jmp == 0x6BFB0000U) + { + unsigned long r27, addr; + unsigned long addrh = (ldah | 0xFFFFFFFFFFFF0000UL) << 16; + unsigned long addrl = ldq | 0xFFFFFFFFFFFF0000UL; + + addr = regs->r27 + ((addrh ^ 0x80000000UL) + 0x80000000UL) + ((addrl ^ 0x8000UL) + 0x8000UL); + err = get_user(r27, (unsigned long*)addr); +
if (err) + break; + + regs->r27 = r27; + regs->pc = r27; + return 2; + } + } while (0); + + do { /* PaX: patched PLT emulation #2 */ + unsigned int ldah, lda, br; + + err = get_user(ldah, (unsigned int *)regs->pc); + err |= get_user(lda, (unsigned int *)(regs->pc+4)); + err |= get_user(br, (unsigned int *)(regs->pc+8)); + + if (err) + break; + + if ((ldah & 0xFFFF0000U) == 0x277B0000U && + (lda & 0xFFFF0000U) == 0xA77B0000U && + (br & 0xFFE00000U) == 0xC3E00000U) + { + unsigned long addr = br | 0xFFFFFFFFFFE00000UL; + unsigned long addrh = (ldah | 0xFFFFFFFFFFFF0000UL) << 16; + unsigned long addrl = lda | 0xFFFFFFFFFFFF0000UL; + + regs->r27 += ((addrh ^ 0x80000000UL) + 0x80000000UL) + ((addrl ^ 0x8000UL) + 0x8000UL); + regs->pc += 12 + (((addr ^ 0x00100000UL) + 0x00100000UL) << 2); + return 2; + } + } while (0); + + do { /* PaX: unpatched PLT emulation */ + unsigned int br; + + err = get_user(br, (unsigned int *)regs->pc); + + if (!err && (br & 0xFFE00000U) == 0xC3800000U) { + unsigned int br2, ldq, nop, jmp; + unsigned long addr = br | 0xFFFFFFFFFFE00000UL, resolver; + + addr = regs->pc + 4 + (((addr ^ 0x00100000UL) + 0x00100000UL) << 2); + err = get_user(br2, (unsigned int *)addr); + err |= get_user(ldq, (unsigned int *)(addr+4)); + err |= get_user(nop, (unsigned int *)(addr+8)); + err |= get_user(jmp, (unsigned int *)(addr+12)); + err |= get_user(resolver, (unsigned long *)(addr+16)); + + if (err) + break; + + if (br2 == 0xC3600000U && + ldq == 0xA77B000CU && + nop == 0x47FF041FU && + jmp == 0x6B7B0000U) + { + regs->r28 = regs->pc+4; + regs->r27 = addr+16; + regs->pc = resolver; + return 3; + } + } + } while (0); +#endif + + return 1; +} + +void pax_report_insns(void *pc, void *sp) +{ + unsigned long i; + + printk(KERN_ERR "PAX: bytes at PC: "); + for (i = 0; i < 5; i++) { + unsigned int c; + if (get_user(c, (unsigned int*)pc+i)) + printk("???????? "); + else + printk("%08x ", c); + } + printk("\n"); +} +#endif /* * This routine handles page faults. 
It determines the address, @@ -133,8 +250,29 @@ do_page_fault(unsigned long address, uns good_area: info.si_code = SEGV_ACCERR; if (cause < 0) { - if (!(vma->vm_flags & VM_EXEC)) + if (!(vma->vm_flags & VM_EXEC)) { + +#ifdef CONFIG_PAX_PAGEEXEC + if (!(mm->pax_flags & MF_PAX_PAGEEXEC) || address != regs->pc) + goto bad_area; + + up_read(&mm->mmap_sem); + switch(pax_handle_fetch_fault(regs)) { + +#ifdef CONFIG_PAX_EMUPLT + case 2: + case 3: + return; +#endif + + } + pax_report_fault(regs, (void*)regs->pc, (void*)rdusp()); + do_exit(SIGKILL); +#else goto bad_area; +#endif + + } } else if (!cause) { /* Allow reads even for write-only mappings */ if (!(vma->vm_flags & (VM_READ | VM_WRITE))) diff -NurpX nopatch linux-2.4.36.6/arch/i386/Makefile linux-2.4.36.6-pax/arch/i386/Makefile --- linux-2.4.36.6/arch/i386/Makefile 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/Makefile 2008-05-08 12:41:32.000000000 +0200 @@ -119,6 +119,9 @@ arch/i386/mm: dummy MAKEBOOT = $(MAKE) -C arch/$(ARCH)/boot +arch/i386/vmlinux.lds: arch/i386/vmlinux.lds.S FORCE + $(CPP) -C -P -I$(HPATH) -D__KERNEL__ -imacros $(HPATH)/linux/config.h -imacros $(HPATH)/asm-i386/segment.h -imacros $(HPATH)/asm-i386/page.h -Ui386 arch/i386/vmlinux.lds.S >arch/i386/vmlinux.lds + vmlinux: arch/i386/vmlinux.lds FORCE: ; @@ -155,6 +158,7 @@ archclean: @$(MAKEBOOT) clean archmrproper: + rm -f arch/i386/vmlinux.lds archdep: @$(MAKEBOOT) dep diff -NurpX nopatch linux-2.4.36.6/arch/i386/boot/bootsect.S linux-2.4.36.6-pax/arch/i386/boot/bootsect.S --- linux-2.4.36.6/arch/i386/boot/bootsect.S 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/boot/bootsect.S 2008-05-08 12:41:32.000000000 +0200 @@ -237,7 +237,7 @@ rp_read: #ifdef __BIG_KERNEL__ # look in setup.S for bootsect_kludge bootsect_kludge = 0x220 # 0x200 + 0x20 which is the size of the - lcall bootsect_kludge # bootsector + bootsect_kludge offset + lcall *bootsect_kludge # bootsector + bootsect_kludge offset #else movw %es, 
%ax subw $SYSSEG, %ax diff -NurpX nopatch linux-2.4.36.6/arch/i386/boot/compressed/head.S linux-2.4.36.6-pax/arch/i386/boot/compressed/head.S --- linux-2.4.36.6/arch/i386/boot/compressed/head.S 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/boot/compressed/head.S 2008-05-08 12:41:32.000000000 +0200 @@ -38,11 +38,13 @@ startup_32: movl %eax,%gs lss SYMBOL_NAME(stack_start),%esp + movl 0x000000,%ecx xorl %eax,%eax 1: incl %eax # check that A20 really IS enabled movl %eax,0x000000 # loop forever if it isn't cmpl %eax,0x100000 je 1b + movl %ecx,0x000000 /* * Initialize eflags. Some BIOS's leave bits like NT set. This would diff -NurpX nopatch linux-2.4.36.6/arch/i386/boot/setup.S linux-2.4.36.6-pax/arch/i386/boot/setup.S --- linux-2.4.36.6/arch/i386/boot/setup.S 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/boot/setup.S 2008-05-08 12:41:32.000000000 +0200 @@ -637,7 +637,7 @@ edd_done: cmpw $0, %cs:realmode_swtch jz rmodeswtch_normal - lcall %cs:realmode_swtch + lcall *%cs:realmode_swtch jmp rmodeswtch_end diff -NurpX nopatch linux-2.4.36.6/arch/i386/config.in linux-2.4.36.6-pax/arch/i386/config.in --- linux-2.4.36.6/arch/i386/config.in 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/config.in 2008-05-08 12:41:32.000000000 +0200 @@ -102,6 +102,7 @@ if [ "$CONFIG_M586MMX" = "y" ]; then fi if [ "$CONFIG_M686" = "y" ]; then define_int CONFIG_X86_L1_CACHE_SHIFT 5 + define_bool CONFIG_X86_ALIGNMENT_16 y define_bool CONFIG_X86_HAS_TSC y define_bool CONFIG_X86_GOOD_APIC y bool 'PGE extensions (not for Cyrix/Transmeta)' CONFIG_X86_PGE @@ -111,6 +112,7 @@ if [ "$CONFIG_M686" = "y" ]; then fi if [ "$CONFIG_MPENTIUMIII" = "y" ]; then define_int CONFIG_X86_L1_CACHE_SHIFT 5 + define_bool CONFIG_X86_ALIGNMENT_16 y define_bool CONFIG_X86_HAS_TSC y define_bool CONFIG_X86_GOOD_APIC y define_bool CONFIG_X86_PGE y @@ -119,6 +121,7 @@ if [ "$CONFIG_MPENTIUMIII" = "y" ]; then fi if [ "$CONFIG_MPENTIUM4" = "y" ]; then 
define_int CONFIG_X86_L1_CACHE_SHIFT 7 + define_bool CONFIG_X86_ALIGNMENT_16 y define_bool CONFIG_X86_HAS_TSC y define_bool CONFIG_X86_GOOD_APIC y define_bool CONFIG_X86_PGE y @@ -138,6 +141,7 @@ if [ "$CONFIG_MK8" = "y" ]; then fi if [ "$CONFIG_MK7" = "y" ]; then define_int CONFIG_X86_L1_CACHE_SHIFT 6 + define_bool CONFIG_X86_ALIGNMENT_16 y define_bool CONFIG_X86_HAS_TSC y define_bool CONFIG_X86_GOOD_APIC y define_bool CONFIG_X86_USE_3DNOW y @@ -491,5 +495,84 @@ int 'Kernel messages buffer length shift endmenu +mainmenu_option next_comment +comment 'PaX options' + +mainmenu_option next_comment +comment 'PaX Control' +bool 'Support soft mode' CONFIG_PAX_SOFTMODE +bool 'Use legacy ELF header marking' CONFIG_PAX_EI_PAX +bool 'Use ELF program header marking' CONFIG_PAX_PT_PAX_FLAGS +choice 'MAC system integration' \ + "none CONFIG_PAX_NO_ACL_FLAGS \ + direct CONFIG_PAX_HAVE_ACL_FLAGS \ + hook CONFIG_PAX_HOOK_ACL_FLAGS" none +endmenu + +mainmenu_option next_comment +comment 'Non-executable pages' +if [ "$CONFIG_PAX_EI_PAX" = "y" -o \ + "$CONFIG_PAX_PT_PAX_FLAGS" = "y" -o \ + "$CONFIG_PAX_HAVE_ACL_FLAGS" = "y" -o \ + "$CONFIG_PAX_HOOK_ACL_FLAGS" = "y" ]; then + bool 'Enforce non-executable pages' CONFIG_PAX_NOEXEC + if [ "$CONFIG_PAX_NOEXEC" = "y" ]; then + if [ "$CONFIG_M586" = "y" -o \ + "$CONFIG_M586TSC" = "y" -o \ + "$CONFIG_M586MMX" = "y" -o \ + "$CONFIG_M686" = "y" -o \ + "$CONFIG_MPENTIUMIII" = "y" -o \ + "$CONFIG_MPENTIUM4" = "y" -o \ + "$CONFIG_MK7" = "y" -o \ + "$CONFIG_MK8" = "y" -o \ + "$CONFIG_MWINCHIPC6" = "y" -o \ + "$CONFIG_MWINCHIP2" = "y" -o \ + "$CONFIG_MWINCHIP3D" = "y" -o \ + "$CONFIG_MVIAC3_2" = "y" ]; then + bool 'Paging based non-executable pages' CONFIG_PAX_PAGEEXEC + fi + bool 'Segmentation based non-executable pages' CONFIG_PAX_SEGMEXEC + if [ "$CONFIG_PAX_PAGEEXEC" = "y" -o "$CONFIG_PAX_SEGMEXEC" = "y" ]; then + bool ' Emulate trampolines' CONFIG_PAX_EMUTRAMP + if [ "$CONFIG_PAX_EMUTRAMP" = "y" ]; then + bool ' Automatically emulate sigreturn 
trampolines' CONFIG_PAX_EMUSIGRT + fi + bool ' Restrict mprotect()' CONFIG_PAX_MPROTECT + if [ "$CONFIG_PAX_MPROTECT" = "y" ]; then + bool ' Disallow ELF text relocations' CONFIG_PAX_NOELFRELOCS + fi + fi + if [ "$CONFIG_MODULES" != "y" -a "$CONFIG_X86_WP_WORKS_OK" = "y" ]; then + bool 'Enforce non-executable kernel pages' CONFIG_PAX_KERNEXEC + fi + fi +fi +endmenu + +mainmenu_option next_comment +comment 'Address Space Layout Randomization' +if [ "$CONFIG_PAX_EI_PAX" = "y" -o \ + "$CONFIG_PAX_PT_PAX_FLAGS" = "y" -o \ + "$CONFIG_PAX_HAVE_ACL_FLAGS" = "y" -o \ + "$CONFIG_PAX_HOOK_ACL_FLAGS" = "y" ]; then + bool 'Address Space Layout Randomization' CONFIG_PAX_ASLR + if [ "$CONFIG_PAX_ASLR" = "y" ]; then + if [ "$CONFIG_X86_TSC" = "y" ]; then + bool ' Randomize kernel stack base' CONFIG_PAX_RANDKSTACK + fi + bool ' Randomize user stack base' CONFIG_PAX_RANDUSTACK + bool ' Randomize mmap() base' CONFIG_PAX_RANDMMAP + fi +fi +endmenu + +mainmenu_option next_comment +comment 'Miscellaneous hardening features' +bool 'Sanitize all freed memory' CONFIG_PAX_MEMORY_SANITIZE +bool 'Prevent invalid userland pointer dereference' CONFIG_PAX_MEMORY_UDEREF +endmenu + +endmenu + source crypto/Config.in source lib/Config.in diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/acpi.c linux-2.4.36.6-pax/arch/i386/kernel/acpi.c --- linux-2.4.36.6/arch/i386/kernel/acpi.c 2008-05-08 12:39:21.000000000 +0200 +++ linux-2.4.36.6-pax/arch/i386/kernel/acpi.c 2008-05-08 12:41:32.000000000 +0200 @@ -370,7 +370,7 @@ acpi_scan_rsdp ( * RSDP signature. 
*/ for (offset = 0; offset < length; offset += 16) { - if (strncmp((char *) (start + offset), "RSD PTR ", sig_len)) + if (strncmp((char *) (phys_to_virt(start) + offset), "RSD PTR ", sig_len)) continue; return (start + offset); } @@ -708,7 +708,7 @@ static void acpi_create_identity_pmd (vo saved_pmd = *pmd; /* set the new one */ - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(ptep))); + set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(ptep))); /* flush the TLB */ local_flush_tlb(); diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/apm.c linux-2.4.36.6-pax/arch/i386/kernel/apm.c --- linux-2.4.36.6/arch/i386/kernel/apm.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/apm.c 2008-05-08 12:41:32.000000000 +0200 @@ -223,7 +223,7 @@ #include extern unsigned long get_cmos_time(void); -extern void machine_real_restart(unsigned char *, int); +extern void machine_real_restart(const unsigned char *, unsigned int); #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -614,7 +614,7 @@ static u8 apm_bios_call(u32 func, u32 eb __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" - "lcall %%cs:" SYMBOL_NAME_STR(apm_bios_entry) "\n\t" + "lcall *%%ss:" SYMBOL_NAME_STR(apm_bios_entry) "\n\t" "setc %%al\n\t" "popl %%ebp\n\t" "popl %%edi\n\t" @@ -666,7 +666,7 @@ static u8 apm_bios_call_simple(u32 func, __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" - "lcall %%cs:" SYMBOL_NAME_STR(apm_bios_entry) "\n\t" + "lcall *%%ss:" SYMBOL_NAME_STR(apm_bios_entry) "\n\t" "setc %%bl\n\t" "popl %%ebp\n\t" "popl %%edi\n\t" @@ -924,7 +924,7 @@ recalc: static void apm_power_off(void) { - unsigned char po_bios_call[] = { + const unsigned char po_bios_call[] = { 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ 0x8e, 0xd0, /* movw ax,ss */ 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ @@ -1985,6 +1985,12 @@ static int __init apm_init(void) __va((unsigned long)0x40 << 4)); _set_limit((char *)&gdt[APM_40 >> 3], 4095 
- (0x40 << 4)); +#ifdef CONFIG_PAX_SEGMEXEC + set_base(gdt2[APM_40 >> 3], + __va((unsigned long)0x40 << 4)); + _set_limit((char *)&gdt2[APM_40 >> 3], 4095 - (0x40 << 4)); +#endif + apm_bios_entry.offset = apm_info.bios.offset; apm_bios_entry.segment = APM_CS; set_base(gdt[APM_CS >> 3], @@ -1993,6 +1999,16 @@ static int __init apm_init(void) __va((unsigned long)apm_info.bios.cseg_16 << 4)); set_base(gdt[APM_DS >> 3], __va((unsigned long)apm_info.bios.dseg << 4)); + +#ifdef CONFIG_PAX_SEGMEXEC + set_base(gdt2[APM_CS >> 3], + __va((unsigned long)apm_info.bios.cseg << 4)); + set_base(gdt2[APM_CS_16 >> 3], + __va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_base(gdt2[APM_DS >> 3], + __va((unsigned long)apm_info.bios.dseg << 4)); +#endif + #ifndef APM_RELAX_SEGMENTS if (apm_info.bios.version == 0x100) { #endif @@ -2002,6 +2018,13 @@ static int __init apm_init(void) _set_limit((char *)&gdt[APM_CS_16 >> 3], 64 * 1024 - 1); /* For the DEC Hinote Ultra CT475 (and others?) */ _set_limit((char *)&gdt[APM_DS >> 3], 64 * 1024 - 1); + +#ifdef CONFIG_PAX_SEGMEXEC + _set_limit((char *)&gdt2[APM_CS >> 3], 64 * 1024 - 1); + _set_limit((char *)&gdt2[APM_CS_16 >> 3], 64 * 1024 - 1); + _set_limit((char *)&gdt2[APM_DS >> 3], 64 * 1024 - 1); +#endif + #ifndef APM_RELAX_SEGMENTS } else { _set_limit((char *)&gdt[APM_CS >> 3], @@ -2010,6 +2033,16 @@ static int __init apm_init(void) (apm_info.bios.cseg_16_len - 1) & 0xffff); _set_limit((char *)&gdt[APM_DS >> 3], (apm_info.bios.dseg_len - 1) & 0xffff); + +#ifdef CONFIG_PAX_SEGMEXEC + _set_limit((char *)&gdt2[APM_CS >> 3], + (apm_info.bios.cseg_len - 1) & 0xffff); + _set_limit((char *)&gdt2[APM_CS_16 >> 3], + (apm_info.bios.cseg_16_len - 1) & 0xffff); + _set_limit((char *)&gdt2[APM_DS >> 3], + (apm_info.bios.dseg_len - 1) & 0xffff); +#endif + } #endif diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/entry.S linux-2.4.36.6-pax/arch/i386/kernel/entry.S --- linux-2.4.36.6/arch/i386/kernel/entry.S 2008-01-01 13:06:40.000000000 +0100 +++ 
linux-2.4.36.6-pax/arch/i386/kernel/entry.S 2008-05-08 12:41:32.000000000 +0200 @@ -84,7 +84,7 @@ processor = 52 ENOSYS = 38 -#define SAVE_ALL \ +#define __SAVE_ALL \ cld; \ pushl %es; \ pushl %ds; \ @@ -99,6 +99,18 @@ ENOSYS = 38 movl %edx,%ds; \ movl %edx,%es; +#ifdef CONFIG_PAX_KERNEXEC +#define SAVE_ALL \ + __SAVE_ALL \ + movl %cr0,%edx; \ + movl %edx,%ebp; \ + orl $0x10000,%edx; \ + xorl %edx,%ebp; \ + movl %edx,%cr0; +#else +#define SAVE_ALL __SAVE_ALL +#endif + #define RESTORE_ALL \ popl %ebx; \ popl %ecx; \ @@ -209,6 +221,17 @@ ENTRY(system_call) jae badsys call *SYMBOL_NAME(sys_call_table)(,%eax,4) movl %eax,EAX(%esp) # save the return value + +#ifdef CONFIG_PAX_RANDKSTACK + cli # need_resched and signals atomic test + cmpl $0,need_resched(%ebx) + jne reschedule + cmpl $0,sigpending(%ebx) + jne signal_return + call SYMBOL_NAME(pax_randomize_kstack) + jmp restore_all +#endif + ENTRY(ret_from_sys_call) cli # need_resched and signals atomic test cmpl $0,need_resched(%ebx) @@ -260,6 +283,13 @@ ret_from_exception: movb CS(%esp),%al testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor? 
jne ret_from_sys_call + +#ifdef CONFIG_PAX_KERNEXEC + movl %cr0, %edx + xorl %ebp, %edx + movl %edx, %cr0 +#endif + jmp restore_all ALIGN @@ -283,6 +313,15 @@ error_code: pushl %ecx pushl %ebx cld + +#ifdef CONFIG_PAX_KERNEXEC + movl %cr0,%edx + movl %edx,%ebp + orl $0x10000,%edx + xorl %edx,%ebp + movl %edx,%cr0 +#endif + movl %es,%ecx movl ORIG_EAX(%esp), %esi # get the error code movl ES(%esp), %edi # get the function address @@ -337,6 +376,13 @@ ENTRY(nmi) pushl %edx call SYMBOL_NAME(do_nmi) addl $8,%esp + +#ifdef CONFIG_PAX_KERNEXEC + movl %cr0, %edx + xorl %ebp, %edx + movl %edx, %cr0 +#endif + RESTORE_ALL ENTRY(int3) @@ -389,8 +435,77 @@ ENTRY(alignment_check) jmp error_code ENTRY(page_fault) +#ifdef CONFIG_PAX_PAGEEXEC + ALIGN + pushl $ SYMBOL_NAME(pax_do_page_fault) +#else pushl $ SYMBOL_NAME(do_page_fault) +#endif + +#ifndef CONFIG_PAX_EMUTRAMP jmp error_code +#else + pushl %ds + pushl %eax + xorl %eax,%eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax # eax = -1 + pushl %ecx + pushl %ebx + cld + +#ifdef CONFIG_PAX_KERNEXEC + movl %cr0,%edx + movl %edx,%ebp + orl $0x10000,%edx + xorl %edx,%ebp + movl %edx,%cr0 +#endif + + movl %es,%ecx + movl ORIG_EAX(%esp), %esi # get the error code + movl ES(%esp), %edi # get the function address + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + movl %esp,%edx + pushl %esi # push the error code + pushl %edx # push the pt_regs pointer + movl $(__KERNEL_DS),%edx + movl %edx,%ds + movl %edx,%es + GET_CURRENT(%ebx) + call *%edi + addl $8,%esp + decl %eax + jnz ret_from_exception + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax +1: popl %ds; +2: popl %es; + addl $4,%esp; + jmp system_call + +.section .fixup,"ax"; +3: movl $0,(%esp); + jmp 1b; +4: movl $0,(%esp); + jmp 2b; +.previous; +.section __ex_table,"a"; + .align 4; + .long 1b,3b; + .long 2b,4b; +.previous +#endif ENTRY(machine_check) pushl $0 @@ -402,7 +517,7 @@ ENTRY(spurious_interrupt_bug) pushl $ 
SYMBOL_NAME(do_spurious_interrupt_bug) jmp error_code -.data +.section .rodata,"a",@progbits ENTRY(sys_call_table) .long SYMBOL_NAME(sys_ni_syscall) /* 0 - old "setup()" system call*/ .long SYMBOL_NAME(sys_exit) diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/head.S linux-2.4.36.6-pax/arch/i386/kernel/head.S --- linux-2.4.36.6/arch/i386/kernel/head.S 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/head.S 2008-05-08 12:41:32.000000000 +0200 @@ -36,11 +36,23 @@ #define X86_CAPABILITY CPU_PARAMS+12 #define X86_VENDOR_ID CPU_PARAMS+36 /* tied to NCAPINTS in cpufeature.h */ +#ifdef CONFIG_PAX_KERNEXEC +/* PaX: fill first page in .text with int3 to catch NULL derefs in kernel mode */ +.fill 4096,1,0xcc +#endif + +/* + * Real beginning of normal "text" segment + */ +ENTRY(stext) +ENTRY(_stext) + /* * swapper_pg_dir is the main page directory, address 0x00101000 * * On entry, %esi points to the real-mode code as a 32-bit pointer. */ +.global startup_32 startup_32: /* * Set segments to known values @@ -51,9 +63,88 @@ startup_32: movl %eax,%es movl %eax,%fs movl %eax,%gs + movl %eax,%ss + #ifdef CONFIG_SMP orw %bx,%bx - jz 1f + jnz 1f +#endif + +#ifdef CONFIG_PAX_MEMORY_UDEREF + /* check for VMware */ + movl $0x564d5868,%eax + xorl %ebx,%ebx + movl $0xa,%ecx + movl $0x5658,%edx + in (%dx),%eax + cmpl $0x564d5868,%ebx + jz 2f + + movl $((((__PAGE_OFFSET-1) & 0xf0000000) >> 12) | 0x00c09700),%eax + movl %eax,(SYMBOL_NAME(gdt_table) - __PAGE_OFFSET + __KERNEL_DS + 4) + +#ifdef CONFIG_PAX_SEGMEXEC + movl %eax,(SYMBOL_NAME(gdt_table2) - __PAGE_OFFSET + __KERNEL_DS + 4) +#endif + +2: +#endif + +#ifdef CONFIG_PAX_KERNEXEC + movl $__KERNEL_TEXT_OFFSET,%eax + movw %ax,(SYMBOL_NAME(gdt_table) + __KERNEL_CS + 2 - __PAGE_OFFSET) + rorl $16,%eax + movb %al,(SYMBOL_NAME(gdt_table) + __KERNEL_CS + 4 - __PAGE_OFFSET) + movb %ah,(SYMBOL_NAME(gdt_table) + __KERNEL_CS + 7 - __PAGE_OFFSET) + +#ifdef CONFIG_PAX_SEGMEXEC + movb %al,(SYMBOL_NAME(gdt_table2) + 
__KERNEL_CS + 4 - __PAGE_OFFSET) + movb %ah,(SYMBOL_NAME(gdt_table2) + __KERNEL_CS + 7 - __PAGE_OFFSET) + rorl $16,%eax + movw %ax,(SYMBOL_NAME(gdt_table2) + __KERNEL_CS + 2 - __PAGE_OFFSET) +#endif + +#endif + +/* + * Clear BSS first so that there are no surprises... + * No need to cld as DF is already clear from cld above... + */ + xorl %eax,%eax + movl $ SYMBOL_NAME(__bss_start) - __PAGE_OFFSET,%edi + movl $ SYMBOL_NAME(__bss_end) - __PAGE_OFFSET,%ecx + subl %edi,%ecx + rep + stosb +/* + * Copy bootup parameters out of the way. First 2kB of + * _empty_zero_page is for boot parameters, second 2kB + * is for the command line. + * + * Note: %esi still has the pointer to the real-mode data. + */ + movl $ SYMBOL_NAME(empty_zero_page) - __PAGE_OFFSET,%edi + movl $512,%ecx + cld + rep + movsl + xorl %eax,%eax + movl $512,%ecx + rep + stosl + movl SYMBOL_NAME(empty_zero_page) - __PAGE_OFFSET + NEW_CL_POINTER,%esi + andl %esi,%esi + jnz 2f # New command line protocol + cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR + jne 1f + movzwl OLD_CL_OFFSET,%esi + addl $(OLD_CL_BASE_ADDR),%esi +2: + movl $ SYMBOL_NAME(empty_zero_page) - __PAGE_OFFSET + 2048,%edi + movl $512,%ecx + rep + movsl +1: /* * New page tables may be in 4Mbyte page mode and may @@ -71,22 +162,28 @@ startup_32: */ #define cr4_bits mmu_cr4_features-__PAGE_OFFSET cmpl $0,cr4_bits - je 3f + je 1f movl %cr4,%eax # Turn on paging options (PSE,PAE,..) 
orl cr4_bits,%eax movl %eax,%cr4 - jmp 3f 1: + +#ifdef CONFIG_SMP + orw %bx,%bx + jnz 3f #endif + /* * Initialize page tables */ movl $pg0-__PAGE_OFFSET,%edi /* initialize page tables */ - movl $007,%eax /* "007" doesn't mean with right to kill, but - PRESENT+RW+USER */ + movl $0x63,%eax /* "0x63" is PRESENT+RW+ACCESSED+DIRTY */ 2: stosl +#ifdef CONFIG_X86_PAE + addl $4,%edi +#endif add $0x1000,%eax - cmp $empty_zero_page-__PAGE_OFFSET,%edi + cmp $0x01000063,%eax jne 2b /* @@ -100,37 +197,16 @@ startup_32: movl %eax,%cr0 /* ..and set paging (PG) bit */ jmp 1f /* flush the prefetch-queue */ 1: - movl $1f,%eax - jmp *%eax /* make sure eip is relocated */ -1: + lgdt gdt_descr + ljmp $__KERNEL_CS,$1f +1: movl $(__KERNEL_DS),%eax # reload all the segment registers + movl %eax,%ds # after changing gdt. + movl %eax,%es + movl %eax,%fs + movl %eax,%gs /* Set up the stack pointer */ lss stack_start,%esp -#ifdef CONFIG_SMP - orw %bx,%bx - jz 1f /* Initial CPU cleans BSS */ - pushl $0 - popfl - jmp checkCPUtype -1: -#endif /* CONFIG_SMP */ - -/* - * Clear BSS first so that there are no surprises... - * No need to cld as DF is already clear from cld above... - */ - xorl %eax,%eax - movl $ SYMBOL_NAME(__bss_start),%edi - movl $ SYMBOL_NAME(_end),%ecx - subl %edi,%ecx - rep - stosb - -/* - * start system 32-bit setup. We need to re-do some of the things done - * in 16-bit mode for the "real" operations. - */ - call setup_idt /* * Initialize eflags. Some BIOS's leave bits like NT set. This would * confuse the debugger if this code is traced. @@ -138,35 +214,18 @@ startup_32: */ pushl $0 popfl + +#ifdef CONFIG_SMP + orw %bx,%bx + jnz checkCPUtype +#endif /* CONFIG_SMP */ + /* - * Copy bootup parameters out of the way. First 2kB of - * _empty_zero_page is for boot parameters, second 2kB - * is for the command line. - * - * Note: %esi still has the pointer to the real-mode data. + * start system 32-bit setup. 
We need to re-do some of the things done + * in 16-bit mode for the "real" operations. */ - movl $ SYMBOL_NAME(empty_zero_page),%edi - movl $512,%ecx - cld - rep - movsl - xorl %eax,%eax - movl $512,%ecx - rep - stosl - movl SYMBOL_NAME(empty_zero_page)+NEW_CL_POINTER,%esi - andl %esi,%esi - jnz 2f # New command line protocol - cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR - jne 1f - movzwl OLD_CL_OFFSET,%esi - addl $(OLD_CL_BASE_ADDR),%esi -2: - movl $ SYMBOL_NAME(empty_zero_page)+2048,%edi - movl $512,%ecx - rep - movsl -1: + call setup_idt + checkCPUtype: movl $-1,X86_CPUID # -1 for no CPUID initially @@ -241,20 +300,7 @@ is386: pushl %ecx # restore original EF 2: movl %eax,%cr0 call check_x87 incb ready - lgdt gdt_descr lidt idt_descr - ljmp $(__KERNEL_CS),$1f -1: movl $(__KERNEL_DS),%eax # reload all the segment registers - movl %eax,%ds # after changing gdt. - movl %eax,%es - movl %eax,%fs - movl %eax,%gs -#ifdef CONFIG_SMP - movl $(__KERNEL_DS), %eax - movl %eax,%ss # Reload the stack pointer (segment only) -#else - lss stack_start,%esp # Load processor stack -#endif xorl %eax,%eax lldt %ax cld # gcc2 wants the direction flag cleared at all times @@ -272,8 +318,6 @@ L6: jmp L6 # main should never return here, but # just in case, we know what happens. -ready: .byte 0 - /* * We depend on ET to be correct. This checks for 287/387. 
*/ @@ -319,13 +363,6 @@ rp_sidt: jne rp_sidt ret -ENTRY(stack_start) - .long SYMBOL_NAME(init_task_union)+8192 - .long __KERNEL_DS - -/* This is the default interrupt "handler" :-) */ -int_msg: - .asciz "Unknown interrupt, stack: %p %p %p %p\n" ALIGN ignore_int: cld @@ -341,6 +378,18 @@ ignore_int: 1: hlt jmp 1b +.data +ready: .byte 0 + +ENTRY(stack_start) + .long SYMBOL_NAME(init_task_union)+8192-8 + .long __KERNEL_DS + +.section .rodata,"a" +/* This is the default interrupt "handler" :-) */ +int_msg: + .asciz "Unknown interrupt, stack: %p %p %p %p\n" + /* * The interrupt descriptor table has room for 256 idt's, * the global descriptor table is dependent on the number @@ -360,60 +409,134 @@ idt_descr: SYMBOL_NAME(idt): .long SYMBOL_NAME(idt_table) +.globl SYMBOL_NAME(boot_gdt_table) +boot_gdt_table: + .fill __KERNEL_CS,1,0 + .quad 0x00cf9b000000ffff /* 0x10 kernel 4GB code at 0x00000000 */ + .quad 0x00cf93000000ffff /* 0x18 kernel 4GB data at 0x00000000 */ + .word 0 gdt_descr: .word GDT_ENTRIES*8-1 SYMBOL_NAME(gdt): .long SYMBOL_NAME(gdt_table) +#ifdef CONFIG_PAX_SEGMEXEC +.globl SYMBOL_NAME(gdt2) + .word 0 +gdt_descr2: + .word GDT_ENTRIES*8-1 +SYMBOL_NAME(gdt2): + .long SYMBOL_NAME(gdt_table2) +#endif + /* - * This is initialized to create an identity-mapping at 0-8M (for bootup - * purposes) and another mapping of the 0-8M area at virtual address + * This is initialized to create an identity-mapping at 0-16M (for bootup + * purposes) and another mapping of the 0-16M area at virtual address * PAGE_OFFSET. 
*/ -.org 0x1000 +.section .swapper_pg_dir,"a",@progbits ENTRY(swapper_pg_dir) - .long 0x00102007 - .long 0x00103007 - .fill BOOT_USER_PGD_PTRS-2,4,0 - /* default: 766 entries */ - .long 0x00102007 - .long 0x00103007 - /* default: 254 entries */ - .fill BOOT_KERNEL_PGD_PTRS-2,4,0 +#ifdef CONFIG_X86_PAE + .long swapper_pm_dir-__PAGE_OFFSET+1 + .long 0 + .long swapper_pm_dir+512*8-__PAGE_OFFSET+1 + .long 0 + .long swapper_pm_dir+512*16-__PAGE_OFFSET+1 + .long 0 + .long swapper_pm_dir+512*24-__PAGE_OFFSET+1 + .long 0 +#else + .long pg0-__PAGE_OFFSET+63 + .long pg0+1024*4-__PAGE_OFFSET+63 + .long pg0+1024*8-__PAGE_OFFSET+63 + .long pg0+1024*12-__PAGE_OFFSET+63 + .fill BOOT_USER_PGD_PTRS-4,4,0 + /* default: 764 entries */ + .long pg0-__PAGE_OFFSET+67 + .long pg0+1024*4-__PAGE_OFFSET+63 + .long pg0+1024*8-__PAGE_OFFSET+63 + .long pg0+1024*12-__PAGE_OFFSET+63 + /* default: 252 entries */ + .fill BOOT_KERNEL_PGD_PTRS-4,4,0 +#endif + +#ifdef CONFIG_X86_PAE +.section .swapper_pm_dir,"a",@progbits +ENTRY(swapper_pm_dir) + .long pg0-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*8-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*16-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*24-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*32-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*40-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*48-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*56-__PAGE_OFFSET+63 + .long 0 + .fill BOOT_USER_PMD_PTRS-8,8,0 + /* default: 1024+512-4 entries */ + .long pg0-__PAGE_OFFSET+67 + .long 0 + .long pg0+512*8-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*16-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*24-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*32-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*40-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*48-__PAGE_OFFSET+63 + .long 0 + .long pg0+512*56-__PAGE_OFFSET+63 + .long 0 + /* default: 512-4 entries */ + .fill BOOT_KERNEL_PMD_PTRS-8,8,0 +#endif /* - * The page tables are initialized to only 8MB here - the final page + * The page tables are initialized to 
only 16MB here - the final page * tables are set up later depending on memory size. */ -.org 0x2000 +.section .pg0,"a",@progbits ENTRY(pg0) + .fill 1024*4,4,0 -.org 0x3000 -ENTRY(pg1) +#ifdef CONFIG_X86_PAE + .fill 1024*4,4,0 +#endif /* * empty_zero_page must immediately follow the page tables ! (The * initialization loop counts until empty_zero_page) */ - -.org 0x4000 +.section .empty_zero_page,"a",@progbits ENTRY(empty_zero_page) - -.org 0x5000 + .fill 1024,4,0 /* - * Real beginning of normal "text" segment + * The IDT has to be page-aligned to simplify the Pentium + * F0 0F bug workaround.. We have a special link segment + * for this. */ -ENTRY(stext) -ENTRY(_stext) +.section .idt,"a",@progbits +ENTRY(idt_table) + .fill 256,8,0 /* * This starts the data section. Note that the above is all * in the text section because it has alignment requirements * that we cannot fulfill any other way. */ -.data +.section .rodata,"a",@progbits ALIGN /* @@ -425,18 +548,39 @@ ALIGN ENTRY(gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* not used */ - .quad 0x00cf9a000000ffff /* 0x10 kernel 4GB code at 0x00000000 */ - .quad 0x00cf92000000ffff /* 0x18 kernel 4GB data at 0x00000000 */ - .quad 0x00cffa000000ffff /* 0x23 user 4GB code at 0x00000000 */ - .quad 0x00cff2000000ffff /* 0x2b user 4GB data at 0x00000000 */ - .quad 0x0000000000000000 /* not used */ + .quad 0x00cf9b000000ffff /* 0x10 kernel 4GB code at 0x00000000 */ + .quad 0x00cf93000000ffff /* 0x18 kernel 4GB data at 0x00000000 */ + .quad 0x00cffb000000ffff /* 0x23 user 4GB code at 0x00000000 */ + .quad 0x00cff3000000ffff /* 0x2b user 4GB data at 0x00000000 */ + .quad 0x0000000000000000 /* PCIBIOS_CS */ + .quad 0x0000000000000000 /* PCIBIOS_DS */ + /* + * The APM segments have byte granularity and their bases + * and limits are set at run time. 
+ */ + .quad 0x0040930000000000 /* 0x40 APM set up for bad BIOS's */ + .quad 0x00409b0000000000 /* 0x48 APM CS code */ + .quad 0x00009b0000000000 /* 0x50 APM CS 16 code (16 bit) */ + .quad 0x0040930000000000 /* 0x58 APM DS data */ + .fill NR_CPUS*4,8,0 /* space for TSS's and LDT's */ + +#ifdef CONFIG_PAX_SEGMEXEC +ENTRY(gdt_table2) + .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* not used */ + .quad 0x00cf9b000000ffff /* 0x10 kernel 4GB code at 0x00000000 */ + .quad 0x00cf93000000ffff /* 0x18 kernel 4GB data at 0x00000000 */ + .quad 0x60c5fb000000ffff /* 0x23 user 1.5GB code at 0x60000000 */ + .quad 0x00cff3000000ffff /* 0x2b user 4GB data at 0x00000000 */ + .quad 0x0000000000000000 /* PCIBIOS_CS */ + .quad 0x0000000000000000 /* PCIBIOS_DS */ /* * The APM segments have byte granularity and their bases * and limits are set at run time. */ - .quad 0x0040920000000000 /* 0x40 APM set up for bad BIOS's */ - .quad 0x00409a0000000000 /* 0x48 APM CS code */ - .quad 0x00009a0000000000 /* 0x50 APM CS 16 code (16 bit) */ - .quad 0x0040920000000000 /* 0x58 APM DS data */ + .quad 0x0040930000000000 /* 0x40 APM set up for bad BIOS's */ + .quad 0x00409b0000000000 /* 0x48 APM CS code */ + .quad 0x00009b0000000000 /* 0x50 APM CS 16 code (16 bit) */ + .quad 0x0040930000000000 /* 0x58 APM DS data */ .fill NR_CPUS*4,8,0 /* space for TSS's and LDT's */ +#endif diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/i386_ksyms.c linux-2.4.36.6-pax/arch/i386/kernel/i386_ksyms.c --- linux-2.4.36.6/arch/i386/kernel/i386_ksyms.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/i386_ksyms.c 2008-05-08 12:41:32.000000000 +0200 @@ -34,7 +34,7 @@ extern void dump_thread(struct pt_regs * extern spinlock_t rtc_lock; #if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE) -extern void machine_real_restart(unsigned char *, int); +extern void machine_real_restart(const unsigned char *, unsigned int); EXPORT_SYMBOL(machine_real_restart); extern void 
default_idle(void); EXPORT_SYMBOL(default_idle); @@ -74,6 +74,11 @@ EXPORT_SYMBOL(pm_power_off); EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL(apm_info); EXPORT_SYMBOL(gdt); + +#ifdef CONFIG_PAX_SEGMEXEC +EXPORT_SYMBOL(gdt2); +#endif + EXPORT_SYMBOL(empty_zero_page); #ifdef CONFIG_DEBUG_IOVIRT @@ -86,6 +91,8 @@ EXPORT_SYMBOL_NOVERS(__down_failed_trylo EXPORT_SYMBOL_NOVERS(__up_wakeup); /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); +EXPORT_SYMBOL(csum_partial_copy_generic_to_user); +EXPORT_SYMBOL(csum_partial_copy_generic_from_user); /* Delay loops */ EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__udelay); diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/i8259.c linux-2.4.36.6-pax/arch/i386/kernel/i8259.c --- linux-2.4.36.6/arch/i386/kernel/i8259.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/i8259.c 2008-05-08 12:41:32.000000000 +0200 @@ -107,7 +107,8 @@ BUILD_SMP_INTERRUPT(spurious_interrupt,S IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -void (*interrupt[NR_IRQS])(void) = { +typedef void (*interrupt_t)(void); +const interrupt_t interrupt[NR_IRQS] = { IRQLIST_16(0x0), #ifdef CONFIG_X86_IO_APIC diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/init_task.c linux-2.4.36.6-pax/arch/i386/kernel/init_task.c --- linux-2.4.36.6/arch/i386/kernel/init_task.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/init_task.c 2008-05-08 12:41:32.000000000 +0200 @@ -29,5 +29,9 @@ union task_union init_task_union * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; +#ifdef CONFIG_PAX_KERNEXEC +struct tss_struct init_tss[NR_CPUS] __attribute__((__aligned__(SMP_CACHE_BYTES), __section__(".rodata"))) = { [0 ... 
NR_CPUS-1] = INIT_TSS }; +#else +struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; +#endif diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/io_apic.c linux-2.4.36.6-pax/arch/i386/kernel/io_apic.c --- linux-2.4.36.6/arch/i386/kernel/io_apic.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/io_apic.c 2008-05-08 12:41:32.000000000 +0200 @@ -620,7 +620,8 @@ next: return current_vector; } -extern void (*interrupt[NR_IRQS])(void); +typedef void (*interrupt_t)(void); +extern const interrupt_t interrupt[NR_IRQS]; static struct hw_interrupt_type ioapic_level_irq_type; static struct hw_interrupt_type ioapic_edge_irq_type; diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/ioport.c linux-2.4.36.6-pax/arch/i386/kernel/ioport.c --- linux-2.4.36.6/arch/i386/kernel/ioport.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/ioport.c 2008-05-08 12:41:32.000000000 +0200 @@ -14,6 +14,7 @@ #include #include #include +#include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, short base, short extent, int new_value) @@ -57,6 +58,10 @@ asmlinkage int sys_ioperm(unsigned long struct thread_struct * t = &current->thread; struct tss_struct * tss = init_tss + smp_processor_id(); +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr0; +#endif + if ((from + num <= from) || (from + num > IO_BITMAP_SIZE*32)) return -EINVAL; if (turn_on && !capable(CAP_SYS_RAWIO)) @@ -78,6 +83,11 @@ asmlinkage int sys_ioperm(unsigned long * do it in the per-thread copy and in the TSS ... */ set_bitmap(t->io_bitmap, from, num, !turn_on); + +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + if (tss->bitmap == IO_BITMAP_OFFSET) { /* already active? 
*/ set_bitmap(tss->io_bitmap, from, num, !turn_on); } else { @@ -85,6 +95,10 @@ asmlinkage int sys_ioperm(unsigned long tss->bitmap = IO_BITMAP_OFFSET; /* Activate it in the TSS */ } +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + return 0; } diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/ldt.c linux-2.4.36.6-pax/arch/i386/kernel/ldt.c --- linux-2.4.36.6/arch/i386/kernel/ldt.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/ldt.c 2008-05-08 12:41:32.000000000 +0200 @@ -151,7 +151,7 @@ static int read_default_ldt(void * ptr, { int err; unsigned long size; - void *address; + const void *address; err = 0; address = &default_ldt[0]; @@ -214,6 +214,13 @@ static int write_ldt(void * ptr, unsigne } } +#ifdef CONFIG_PAX_SEGMEXEC + if ((mm->pax_flags & MF_PAX_SEGMEXEC) && (ldt_info.contents & 2)) { + error = -EINVAL; + goto out_unlock; + } +#endif + entry_1 = ((ldt_info.base_addr & 0x0000ffff) << 16) | (ldt_info.limit & 0x0ffff); entry_2 = (ldt_info.base_addr & 0xff000000) | @@ -224,7 +231,7 @@ static int write_ldt(void * ptr, unsigne ((ldt_info.seg_not_present ^ 1) << 15) | (ldt_info.seg_32bit << 22) | (ldt_info.limit_in_pages << 23) | - 0x7000; + 0x7100; if (!oldmode) entry_2 |= (ldt_info.useable << 20); diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/mpparse.c linux-2.4.36.6-pax/arch/i386/kernel/mpparse.c --- linux-2.4.36.6/arch/i386/kernel/mpparse.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/mpparse.c 2008-05-08 12:41:32.000000000 +0200 @@ -833,7 +833,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc((void *)mpf->mpf_physptr)) { + if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. 
(tell your hw vendor)\n"); diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/pci-pc.c linux-2.4.36.6-pax/arch/i386/kernel/pci-pc.c --- linux-2.4.36.6/arch/i386/kernel/pci-pc.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/pci-pc.c 2008-05-08 12:41:32.000000000 +0200 @@ -17,6 +17,7 @@ #include #include #include +#include #include "pci-i386.h" @@ -575,11 +576,10 @@ union bios32 { * we'll make pcibios_present() take a memory start parameter and store * the array there. */ - static struct { unsigned long address; unsigned short segment; -} bios32_indirect = { 0, __KERNEL_CS }; +} bios32_indirect = { 0, __PCIBIOS_CS }; /* * Returns the entry point for the given service, NULL on error @@ -593,34 +593,122 @@ static unsigned long bios32_service(unsi unsigned long entry; /* %edx */ unsigned long flags; +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr0; +#endif + __save_flags(flags); __cli(); - __asm__("lcall (%%edi); cld" + +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + + gdt_table[6].a = 0x0000FFFFUL; + gdt_table[6].b = 0x00CF9B00UL; + gdt_table[7].a = 0x0000FFFFUL; + gdt_table[7].b = 0x00CF9300UL; + +#ifdef CONFIG_PAX_SEGMEXEC + gdt_table2[6].a = 0x0000FFFFUL; + gdt_table2[6].b = 0x00CF9B00UL; + gdt_table2[7].a = 0x0000FFFFUL; + gdt_table2[7].b = 0x00CF9300UL; +#endif + +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + + __asm__("movw %w7, %%ds; lcall *(%%edi); push %%ss; pop %%ds; cld" : "=a" (return_code), "=b" (address), "=c" (length), "=d" (entry) : "0" (service), "1" (0), - "D" (&bios32_indirect)); + "D" (&bios32_indirect), + "r" (__PCIBIOS_DS) + : "memory"); + +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + + gdt_table[6].a = 0; + gdt_table[6].b = 0; + gdt_table[7].a = 0; + gdt_table[7].b = 0; + +#ifdef CONFIG_PAX_SEGMEXEC + gdt_table2[6].a = 0; + gdt_table2[6].b = 0; + gdt_table2[7].a = 0; + gdt_table2[7].b = 0; +#endif + +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + 
__restore_flags(flags); switch (return_code) { - case 0: - return address + entry; - case 0x80: /* Not present */ - printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service); - return 0; - default: /* Shouldn't happen */ - printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n", - service, return_code); + case 0: { + unsigned long a, b1, b2; + unsigned char flags; + + printk(KERN_INFO "bios32_service: base:%08lx length:%08lx entry:%08lx\n", address, length, entry); + if (address >= 0xFFFF0 || length >= 0xFFFF0 - address || length <= entry) { + printk(KERN_WARNING "bios32_service: not valid\n"); return 0; + } + address = address + PAGE_OFFSET; + length += 16UL; /* some BIOSs underreport this... */ + flags = 4; + if (length >= 64*1024*1024) { + length >>= PAGE_SHIFT; + flags |= 8; + } + a = (length & 0xFFFFUL) | ((address & 0xFFFFUL) << 16); + b1 = (address & 0xFF000000UL) | ((address & 0x00FF0000UL) >> 16) | (length & 0xF0000UL) | (flags << 20) | 0x9B00UL; + b2 = (address & 0xFF000000UL) | ((address & 0x00FF0000UL) >> 16) | (length & 0xF0000UL) | (flags << 20) | 0x9300UL; + +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + + gdt_table[6].a = a; + gdt_table[6].b = b1; + gdt_table[7].a = a; + gdt_table[7].b = b2; + +#ifdef CONFIG_PAX_SEGMEXEC + gdt_table2[6].a = a; + gdt_table2[6].b = b1; + gdt_table2[7].a = a; + gdt_table2[7].b = b2; +#endif + +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + + return entry; + } + case 0x80: /* Not present */ + printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service); + return 0; + default: /* Shouldn't happen */ + printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n", + service, return_code); + return 0; } } static struct { unsigned long address; unsigned short segment; -} pci_indirect = { 0, __KERNEL_CS }; +} pci_indirect = { 0, __PCIBIOS_CS }; static int pci_bios_present; @@ -631,11 +719,13 @@ static int __devinit check_pcibios(void) unsigned 
long flags, pcibios_entry; if ((pcibios_entry = bios32_service(PCI_SERVICE))) { - pci_indirect.address = pcibios_entry + PAGE_OFFSET; + pci_indirect.address = pcibios_entry; __save_flags(flags); __cli(); - __asm__( - "lcall (%%edi); cld\n\t" + __asm__("movw %w6, %%ds\n\t" + "lcall *%%ss:(%%edi); cld\n\t" + "push %%ss\n\t" + "pop %%ds\n\t" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -644,7 +734,8 @@ static int __devinit check_pcibios(void) "=b" (ebx), "=c" (ecx) : "1" (PCIBIOS_PCI_BIOS_PRESENT), - "D" (&pci_indirect) + "D" (&pci_indirect), + "r" (__PCIBIOS_DS) : "memory"); __restore_flags(flags); @@ -680,7 +771,10 @@ static int __devinit pci_bios_find_devic unsigned short bx; unsigned short ret; - __asm__("lcall (%%edi); cld\n\t" + __asm__("movw %w7, %%ds\n\t" + "lcall *%%ss:(%%edi); cld\n\t" + "push %%ss\n\t" + "pop %%ds\n\t" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -690,7 +784,8 @@ static int __devinit pci_bios_find_devic "c" (device_id), "d" (vendor), "S" ((int) index), - "D" (&pci_indirect)); + "D" (&pci_indirect), + "r" (__PCIBIOS_DS)); *bus = (bx >> 8) & 0xff; *device_fn = bx & 0xff; return (int) (ret & 0xff00) >> 8; @@ -709,7 +804,10 @@ static int pci_bios_read (int seg, int b switch (len) { case 1: - __asm__("lcall (%%esi); cld\n\t" + __asm__("movw %w6, %%ds\n\t" + "lcall *%%ss:(%%esi); cld\n\t" + "push %%ss\n\t" + "pop %%ds\n\t" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -718,10 +816,14 @@ static int pci_bios_read (int seg, int b : "1" (PCIBIOS_READ_CONFIG_BYTE), "b" (bx), "D" ((long)reg), - "S" (&pci_indirect)); + "S" (&pci_indirect), + "r" (__PCIBIOS_DS)); break; case 2: - __asm__("lcall (%%esi); cld\n\t" + __asm__("movw %w6, %%ds\n\t" + "lcall *%%ss:(%%esi); cld\n\t" + "push %%ss\n\t" + "pop %%ds\n\t" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -730,10 +832,14 @@ static int pci_bios_read (int seg, int b : "1" (PCIBIOS_READ_CONFIG_WORD), "b" (bx), "D" ((long)reg), - "S" (&pci_indirect)); + "S" (&pci_indirect), + "r" (__PCIBIOS_DS)); break; case 4: - __asm__("lcall 
(%%esi); cld\n\t" + __asm__("movw %w6, %%ds\n\t" + "lcall *%%ss:(%%esi); cld\n\t" + "push %%ss\n\t" + "pop %%ds\n\t" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -742,7 +848,8 @@ static int pci_bios_read (int seg, int b : "1" (PCIBIOS_READ_CONFIG_DWORD), "b" (bx), "D" ((long)reg), - "S" (&pci_indirect)); + "S" (&pci_indirect), + "r" (__PCIBIOS_DS)); break; } @@ -764,7 +871,10 @@ static int pci_bios_write (int seg, int switch (len) { case 1: - __asm__("lcall (%%esi); cld\n\t" + __asm__("movw %w6, %%ds\n\t" + "lcall *%%ss:(%%esi); cld\n\t" + "push %%ss\n\t" + "pop %%ds\n\t" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -773,10 +883,14 @@ static int pci_bios_write (int seg, int "c" (value), "b" (bx), "D" ((long)reg), - "S" (&pci_indirect)); + "S" (&pci_indirect), + "r" (__PCIBIOS_DS)); break; case 2: - __asm__("lcall (%%esi); cld\n\t" + __asm__("movw %w6, %%ds\n\t" + "lcall *%%ss:(%%esi); cld\n\t" + "push %%ss\n\t" + "pop %%ds\n\t" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -785,10 +899,14 @@ static int pci_bios_write (int seg, int "c" (value), "b" (bx), "D" ((long)reg), - "S" (&pci_indirect)); + "S" (&pci_indirect), + "r" (__PCIBIOS_DS)); break; case 4: - __asm__("lcall (%%esi); cld\n\t" + __asm__("movw %w6, %%ds\n\t" + "lcall *%%ss:(%%esi); cld\n\t" + "push %%ss\n\t" + "pop %%ds\n\t" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -797,7 +915,8 @@ static int pci_bios_write (int seg, int "c" (value), "b" (bx), "D" ((long)reg), - "S" (&pci_indirect)); + "S" (&pci_indirect), + "r" (__PCIBIOS_DS)); break; } @@ -1009,10 +1128,13 @@ struct irq_routing_table * __devinit pci DBG("PCI: Fetching IRQ routing table... 
"); __asm__("push %%es\n\t" + "movw %w8, %%ds\n\t" "push %%ds\n\t" "pop %%es\n\t" - "lcall (%%esi); cld\n\t" + "lcall *%%ss:(%%esi); cld\n\t" "pop %%es\n\t" + "push %%ss\n\t" + "pop %%ds\n" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -1023,7 +1145,8 @@ struct irq_routing_table * __devinit pci "1" (0), "D" ((long) &opt), "S" (&pci_indirect), - "m" (opt) + "m" (opt), + "r" (__PCIBIOS_DS) : "memory"); DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map); if (ret & 0xff00) @@ -1047,7 +1170,10 @@ int pcibios_set_irq_routing(struct pci_d { int ret; - __asm__("lcall (%%esi); cld\n\t" + __asm__("movw %w5, %%ds\n\t" + "lcall *%%ss:(%%esi); cld\n\t" + "push %%ss\n\t" + "pop %%ds\n" "jc 1f\n\t" "xor %%ah, %%ah\n" "1:" @@ -1055,7 +1181,8 @@ int pcibios_set_irq_routing(struct pci_d : "0" (PCIBIOS_SET_PCI_HW_INT), "b" ((dev->bus->number << 8) | dev->devfn), "c" ((irq << 8) | (pin + 10)), - "S" (&pci_indirect)); + "S" (&pci_indirect), + "r" (__PCIBIOS_DS)); return !(ret & 0xff00); } diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/process.c linux-2.4.36.6-pax/arch/i386/kernel/process.c --- linux-2.4.36.6/arch/i386/kernel/process.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/process.c 2008-05-08 12:41:32.000000000 +0200 @@ -153,7 +153,7 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -static int reboot_mode; +static unsigned short reboot_mode; int reboot_thru_bios; #ifdef CONFIG_SMP @@ -209,18 +209,18 @@ __setup("reboot=", reboot_setup); doesn't work with at least one type of 486 motherboard. It is easy to stop this code working; hence the copious comments. 
*/ -static unsigned long long +static const unsigned long long real_mode_gdt_entries [3] = { 0x0000000000000000ULL, /* Null descriptor */ - 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ - 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ + 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ + 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct +static const struct { unsigned short size __attribute__ ((packed)); - unsigned long long * base __attribute__ ((packed)); + const unsigned long long * base __attribute__ ((packed)); } real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries }, real_mode_idt = { 0x3ff, 0 }, @@ -245,7 +245,7 @@ no_idt = { 0, 0 }; More could be done here to set up the registers as if a CPU reset had occurred; hopefully real BIOSs don't assume much. */ -static unsigned char real_mode_switch [] = +static const unsigned char real_mode_switch [] = { 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ @@ -259,7 +259,7 @@ static unsigned char real_mode_switch [] 0x24, 0x10, /* f: andb $0x10,al */ 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ }; -static unsigned char jump_to_bios [] = +static const unsigned char jump_to_bios [] = { 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ }; @@ -278,10 +278,14 @@ static inline void kb_wait(void) * specified by the code and length parameters. * We assume that length will aways be less that 100! */ -void machine_real_restart(unsigned char *code, int length) +void machine_real_restart(const unsigned char *code, unsigned int length) { unsigned long flags; +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr0; +#endif + cli(); /* Write zero to CMOS register number 0x0f, which the BIOS POST @@ -302,9 +306,17 @@ void machine_real_restart(unsigned char from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. 
*/ +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + /* Make sure the first page is mapped to the start of physical memory. It is normally not mapped, to trap kernel NULL pointer dereferences. */ @@ -321,7 +333,7 @@ void machine_real_restart(unsigned char REBOOT.COM programs, and the previous reset routine did this too. */ - *((unsigned short *)0x472) = reboot_mode; + __put_user(reboot_mode, (unsigned short *)0x472); /* For the switch to real mode, copy some code to low memory. It has to be in the first 64k because it is running in 16-bit mode, and it @@ -329,9 +341,9 @@ void machine_real_restart(unsigned char off paging. Copy it near the end of the first page, out of the way of BIOS variables. */ - memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100), + __copy_to_user ((void *) (0x1000 - sizeof (real_mode_switch) - 100), real_mode_switch, sizeof (real_mode_switch)); - memcpy ((void *) (0x1000 - 100), code, length); + __copy_to_user ((void *) (0x1000 - 100), code, length); /* Set up the IDT for real mode. 
*/ @@ -414,7 +426,7 @@ void machine_restart(char * __unused) if(!reboot_thru_bios) { /* rebooting needs to touch the page at absolute addr 0 */ - *((unsigned short *)__va(0x472)) = reboot_mode; + __put_user(reboot_mode, (unsigned short *)0x472); for (;;) { int i; for (i=0; i<100; i++) { @@ -552,7 +564,7 @@ int copy_thread(int nr, unsigned long cl { struct pt_regs * childregs; - childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1; + childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p - sizeof(unsigned long))) - 1; struct_cpy(childregs, regs); childregs->eax = 0; childregs->esp = esp; @@ -613,6 +625,19 @@ void dump_thread(struct pt_regs * regs, dump->u_fpvalid = dump_fpu (regs, &dump->i387); } +#ifdef CONFIG_PAX_SEGMEXEC +void pax_switch_segments(struct task_struct * tsk) +{ + if (!tsk->mm) + return; + + if (tsk->mm->pax_flags & MF_PAX_SEGMEXEC) + __asm__ __volatile__("lgdt %0": "=m" (gdt_descr2)); + else + __asm__ __volatile__("lgdt %0": "=m" (gdt_descr)); +} +#endif + /* * This special macro can be used to load a debugging register */ @@ -650,12 +675,15 @@ void fastcall __switch_to(struct task_st *next = &next_p->thread; struct tss_struct *tss = init_tss + smp_processor_id(); +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr0; +#endif + unlazy_fpu(prev_p); - /* - * Reload esp0, LDT and the page table pointer: - */ - tss->esp0 = next->esp0; +#ifdef CONFIG_PAX_SEGMEXEC + pax_switch_segments(next_p); +#endif /* * Save away %fs and %gs. 
No need to save %es and %ds, as @@ -683,6 +711,15 @@ void fastcall __switch_to(struct task_st loaddebug(next, 7); } +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + + /* + * Reload esp0, LDT and the page table pointer: + */ + tss->esp0 = next->esp0; + if (prev->ioperm || next->ioperm) { if (next->ioperm) { /* @@ -705,6 +742,11 @@ void fastcall __switch_to(struct task_st */ tss->bitmap = INVALID_IO_BITMAP_OFFSET; } + +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + } asmlinkage int sys_fork(struct pt_regs regs) @@ -792,3 +834,44 @@ unsigned long get_wchan(struct task_stru } #undef last_sched #undef first_sched + +#ifdef CONFIG_PAX_RANDKSTACK +asmlinkage void pax_randomize_kstack(void) +{ + struct tss_struct *tss; + unsigned long time; + +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr0; +#endif + +#ifdef CONFIG_PAX_SOFTMODE + if (!pax_aslr) + return; +#endif + + tss = init_tss + smp_processor_id(); + rdtscl(time); + + /* P4 seems to return a 0 LSB, ignore it */ +#ifdef CONFIG_MPENTIUM4 + time &= 0x1EUL; + time <<= 2; +#else + time &= 0xFUL; + time <<= 3; +#endif + +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + + tss->esp0 ^= time; + current->thread.esp0 = tss->esp0; + +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + +} +#endif diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/setup.c linux-2.4.36.6-pax/arch/i386/kernel/setup.c --- linux-2.4.36.6/arch/i386/kernel/setup.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/setup.c 2008-05-08 12:41:32.000000000 +0200 @@ -129,7 +129,11 @@ char ignore_irq13; /* set if exception 16 works */ struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +#ifdef CONFIG_X86_PAE +unsigned long mmu_cr4_features = X86_CR4_PAE; +#else unsigned long mmu_cr4_features; +#endif EXPORT_SYMBOL(mmu_cr4_features); /* @@ -170,7 +174,7 @@ unsigned char aux_device_present; extern void mcheck_init(struct cpuinfo_x86 *c); extern void dmi_scan_machine(void); 
extern int root_mountflags; -extern char _text, _etext, _edata, _end; +extern char _text, _etext, _data, _edata, _end; static int have_cpuid_p(void) __init; @@ -1209,14 +1213,14 @@ void __init setup_arch(char **cmdline_p) if (!MOUNT_ROOT_RDONLY) root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) &_text; - init_mm.end_code = (unsigned long) &_etext; + init_mm.start_code = (unsigned long) &_text + __KERNEL_TEXT_OFFSET; + init_mm.end_code = (unsigned long) &_etext + __KERNEL_TEXT_OFFSET; init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; - code_resource.start = virt_to_bus(&_text); - code_resource.end = virt_to_bus(&_etext)-1; - data_resource.start = virt_to_bus(&_etext); + code_resource.start = virt_to_bus(&_text + __KERNEL_TEXT_OFFSET); + code_resource.end = virt_to_bus(&_etext + __KERNEL_TEXT_OFFSET)-1; + data_resource.start = virt_to_bus(&_data); data_resource.end = virt_to_bus(&_edata)-1; parse_cmdline_early(cmdline_p); @@ -3184,6 +3188,10 @@ void __init cpu_init (void) int nr = smp_processor_id(); struct tss_struct * t = &init_tss[nr]; +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr0; +#endif + if (test_and_set_bit(nr, &cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", nr); for (;;) __sti(); @@ -3218,10 +3226,19 @@ void __init cpu_init (void) BUG(); enter_lazy_tlb(&init_mm, current, nr); - t->esp0 = current->thread.esp0; set_tss_desc(nr,t); - gdt_table[__TSS(nr)].b &= 0xfffffdff; + +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + + t->esp0 = current->thread.esp0; load_TR(nr); + +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + load_LDT(&init_mm.context); /* @@ -3288,7 +3305,53 @@ int __init ppro_with_ram_bug(void) printk(KERN_INFO "Your Pentium Pro seems ok.\n"); return 0; } - + +static int current_ypos = 25, current_xpos; +#define VGABASE (0xb8000) +#define VGAXY(x, y) (VGABASE + 2 * (x + y * SCREEN_INFO.orig_video_cols)) + +static void early_vga_write(const char 
*str, int n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= SCREEN_INFO.orig_video_lines) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < SCREEN_INFO.orig_video_lines; k++, j++) { + for (i = 0; i < SCREEN_INFO.orig_video_cols; i++) { + isa_writew(isa_readw(VGAXY(i, k)), VGAXY(i, j)); + } + } + for (i = 0; i < SCREEN_INFO.orig_video_cols; i++) + isa_writew(0x720, VGAXY(i, j)); + current_ypos = SCREEN_INFO.orig_video_lines-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + isa_writew((0x700 | (unsigned short) c), VGAXY(current_xpos, current_ypos)); + if (++current_xpos >= SCREEN_INFO.orig_video_cols) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +asmlinkage void __init early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + + va_start(ap, fmt); + n = vsnprintf(buf, 512, fmt, ap); + early_vga_write(buf, n); + va_end(ap); +} + /* * Local Variables: * mode:c diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/sys_i386.c linux-2.4.36.6-pax/arch/i386/kernel/sys_i386.c --- linux-2.4.36.6/arch/i386/kernel/sys_i386.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/sys_i386.c 2008-05-08 12:41:32.000000000 +0200 @@ -48,6 +48,11 @@ static inline long do_mmap2( int error = -EBADF; struct file * file = NULL; +#ifdef CONFIG_PAX_SEGMEXEC + if (flags & MAP_MIRROR) + return -EINVAL; +#endif + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); if (!(flags & MAP_ANONYMOUS)) { file = fget(fd); diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/trampoline.S linux-2.4.36.6-pax/arch/i386/kernel/trampoline.S --- linux-2.4.36.6/arch/i386/kernel/trampoline.S 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/trampoline.S 2008-05-08 12:41:32.000000000 +0200 @@ -54,7 +54,7 @@ r_base = . 
lmsw %ax # into protected mode jmp flush_instr flush_instr: - ljmpl $__KERNEL_CS, $0x00100000 + ljmpl $__KERNEL_CS, $SYMBOL_NAME(startup_32) + __KERNEL_TEXT_OFFSET - __PAGE_OFFSET # jump to startup_32 in arch/i386/kernel/head.S idt_48: @@ -62,8 +62,8 @@ idt_48: .word 0, 0 # idt base = 0L gdt_48: - .word 0x0800 # gdt limit = 2048, 256 GDT entries - .long gdt_table-__PAGE_OFFSET # gdt base = gdt (first SMP CPU) + .word __KERNEL_DS+7 # gdt limit = just the minimum + .long boot_gdt_table-__PAGE_OFFSET # gdt base = boot_gdt (first SMP CPU) .globl SYMBOL_NAME(trampoline_end) SYMBOL_NAME_LABEL(trampoline_end) diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/traps.c linux-2.4.36.6-pax/arch/i386/kernel/traps.c --- linux-2.4.36.6/arch/i386/kernel/traps.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/traps.c 2008-05-08 12:41:32.000000000 +0200 @@ -54,15 +54,10 @@ asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, +const struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } }; -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. 
- */ -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; +extern struct desc_struct idt_table[256]; asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -87,6 +82,7 @@ asmlinkage void machine_check(void); int kstack_depth_to_print = 24; +extern char _text, _sinittext, _einittext; /* * If the address is either in the .text section of the @@ -104,6 +100,10 @@ static inline int kernel_text_address(un int retval = 0; struct module *mod; + if (addr >= (unsigned long) &_sinittext && + addr <= (unsigned long) &_einittext) + return 1; + if (addr >= (unsigned long) &_stext && addr <= (unsigned long) &_etext) return 1; @@ -125,8 +125,15 @@ static inline int kernel_text_address(un static inline int kernel_text_address(unsigned long addr) { - return (addr >= (unsigned long) &_stext && - addr <= (unsigned long) &_etext); + if (addr >= (unsigned long) &_sinittext && + addr <= (unsigned long) &_einittext) + return 1; + + if (addr >= (unsigned long) &_stext && + addr <= (unsigned long) &_etext) + return 1; + + return 0; } #endif @@ -228,13 +235,13 @@ void show_registers(struct pt_regs *regs show_stack((unsigned long*)esp); printk("\nCode: "); - if(regs->eip < PAGE_OFFSET) + if(regs->eip + __KERNEL_TEXT_OFFSET < PAGE_OFFSET) goto bad; for(i=0;i<20;i++) { unsigned char c; - if(__get_user(c, &((unsigned char*)regs->eip)[i])) { + if(__get_user(c, &((unsigned char*)regs->eip)[i+__KERNEL_TEXT_OFFSET])) { bad: printk(" Bad EIP value."); break; @@ -256,7 +263,7 @@ static void handle_BUG(struct pt_regs *r if (regs->xcs & 3) goto no_bug; /* Not in kernel */ - eip = regs->eip; + eip = regs->eip + __KERNEL_TEXT_OFFSET; if (eip < PAGE_OFFSET) goto no_bug; @@ -264,10 +271,11 @@ static void handle_BUG(struct pt_regs *r goto no_bug; if (ud2 != 0x0b0f) goto no_bug; - if (__get_user(line, (unsigned short *)(eip + 2))) + if (__get_user(line, (unsigned short *)(eip + 7))) goto bug; - if (__get_user(file, (char **)(eip + 4)) || - (unsigned 
long)file < PAGE_OFFSET || __get_user(c, file)) + if (__get_user(file, (char **)(eip + 3)) || file < &_text + __KERNEL_TEXT_OFFSET) + goto bug; + if (__get_user(c, file)) file = ""; printk("kernel BUG at %s:%d!\n", file, line); @@ -422,6 +430,13 @@ gp_in_kernel: regs->eip = fixup; return; } + +#ifdef CONFIG_PAX_KERNEXEC + if ((regs->xcs & 0xFFFF) == __KERNEL_CS) + die("PAX: suspicious general protection fault", regs, error_code); + else +#endif + die("general protection fault", regs, error_code); } } @@ -527,13 +542,12 @@ asmlinkage void do_debug(struct pt_regs { unsigned int condition; struct task_struct *tsk = current; - unsigned long eip = regs->eip; siginfo_t info; __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); /* If the user set TF, it's simplest to clear it right away. */ - if ((eip >=PAGE_OFFSET) && (regs->eflags & TF_MASK)) + if (!(regs->xcs & 3) && (regs->eflags & TF_MASK) && !(regs->eflags & VM_MASK)) goto clear_TF; /* Mask out spurious debug traps due to lazy DR7 setting */ @@ -778,6 +792,8 @@ asmlinkage void math_emulate(long arg) #ifndef CONFIG_X86_F00F_WORKS_OK void __init trap_init_f00f_bug(void) { + +#ifndef CONFIG_PAX_KERNEXEC /* * "idt" is magic - it overlaps the idt_descr * variable so that updating idt will automatically @@ -787,12 +803,17 @@ void __init trap_init_f00f_bug(void) idt = (struct desc_struct *)__fix_to_virt(FIX_F00F); __asm__ __volatile__("lidt %0": "=m" (idt_descr)); +#endif + } #endif +#ifdef CONFIG_PAX_KERNEXEC #define _set_gate(gate_addr,type,dpl,addr) \ do { \ int __d0, __d1; \ + unsigned long cr0; \ + pax_open_kernel(cr0); \ __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ "movw %4,%%dx\n\t" \ "movl %%eax,%0\n\t" \ @@ -801,8 +822,22 @@ do { \ "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ "3" ((char *) (addr)),"2" (__KERNEL_CS << 16)); \ + pax_close_kernel(cr0); \ } while (0) - +#else +#define _set_gate(gate_addr,type,dpl,addr) \ +do { \ + int __d0, __d1; \ 
+ __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ + "movw %4,%%dx\n\t" \ + "movl %%eax,%0\n\t" \ + "movl %%edx,%1" \ + :"=m" (*((long *) (gate_addr))), \ + "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ + :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ + "3" ((char *) (addr)),"2" (__KERNEL_CS << 16)); \ +} while (0) +#endif /* * This needs to use 'idt_table' rather than 'idt', and @@ -810,26 +845,42 @@ do { \ * Pentium F0 0F bugfix can have resulted in the mapped * IDT being write-protected. */ -void set_intr_gate(unsigned int n, void *addr) +void set_intr_gate(unsigned int n, const void *addr) { _set_gate(idt_table+n,14,0,addr); } -static void __init set_trap_gate(unsigned int n, void *addr) +static void __init set_trap_gate(unsigned int n, const void *addr) { _set_gate(idt_table+n,15,0,addr); } -static void __init set_system_gate(unsigned int n, void *addr) +static void __init set_system_gate(unsigned int n, const void *addr) { _set_gate(idt_table+n,15,3,addr); } -static void __init set_call_gate(void *a, void *addr) +static void __init set_call_gate(const void *a, const void *addr) { _set_gate(a,12,3,addr); } +#ifdef CONFIG_PAX_KERNEXEC +#define _set_seg_desc(gate_addr,type,dpl,base,limit) \ +do {\ + unsigned long cr0; \ + pax_open_kernel(cr0); \ + *((gate_addr)+1) = ((base) & 0xff000000) | \ + (((base) & 0x00ff0000)>>16) | \ + ((limit) & 0xf0000) | \ + ((dpl)<<13) | \ + (0x00408000) | \ + ((type)<<8); \ + *(gate_addr) = (((base) & 0x0000ffff)<<16) | \ + ((limit) & 0x0ffff); \ + pax_close_kernel(cr0); \ +} while (0) +#else #define _set_seg_desc(gate_addr,type,dpl,base,limit) {\ *((gate_addr)+1) = ((base) & 0xff000000) | \ (((base) & 0x00ff0000)>>16) | \ @@ -839,7 +890,25 @@ static void __init set_call_gate(void *a ((type)<<8); \ *(gate_addr) = (((base) & 0x0000ffff)<<16) | \ ((limit) & 0x0ffff); } +#endif +#ifdef CONFIG_PAX_KERNEXEC +#define _set_tssldt_desc(n,addr,limit,type) \ +do { \ + unsigned long cr0; \ + pax_open_kernel(cr0); \ + __asm__ 
__volatile__ ("movw %w3,0(%2)\n\t" \ + "movw %%ax,2(%2)\n\t" \ + "rorl $16,%%eax\n\t" \ + "movb %%al,4(%2)\n\t" \ + "movb %4,5(%2)\n\t" \ + "movb $0,6(%2)\n\t" \ + "movb %%ah,7(%2)\n\t" \ + "rorl $16,%%eax" \ + : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type)); \ + pax_close_kernel(cr0); \ +} while (0) +#else #define _set_tssldt_desc(n,addr,limit,type) \ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ "movw %%ax,2(%2)\n\t" \ @@ -850,15 +919,26 @@ __asm__ __volatile__ ("movw %w3,0(%2)\n\ "movb %%ah,7(%2)\n\t" \ "rorl $16,%%eax" \ : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type)) +#endif -void set_tss_desc(unsigned int n, void *addr) +void set_tss_desc(unsigned int n, const void *addr) { _set_tssldt_desc(gdt_table+__TSS(n), (int)addr, 235, 0x89); + +#ifdef CONFIG_PAX_SEGMEXEC + _set_tssldt_desc(gdt_table2+__TSS(n), (int)addr, 235, 0x89); +#endif + } -void set_ldt_desc(unsigned int n, void *addr, unsigned int size) +void set_ldt_desc(unsigned int n, const void *addr, unsigned int size) { _set_tssldt_desc(gdt_table+__LDT(n), (int)addr, ((size << 3)-1), 0x82); + +#ifdef CONFIG_PAX_SEGMEXEC + _set_tssldt_desc(gdt_table2+__LDT(n), (int)addr, ((size << 3)-1), 0x82); +#endif + } #ifdef CONFIG_X86_VISWS_APIC diff -NurpX nopatch linux-2.4.36.6/arch/i386/kernel/vm86.c linux-2.4.36.6-pax/arch/i386/kernel/vm86.c --- linux-2.4.36.6/arch/i386/kernel/vm86.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/kernel/vm86.c 2008-05-08 12:41:32.000000000 +0200 @@ -44,6 +44,7 @@ #include #include #include +#include /* * Known problems: @@ -97,6 +98,10 @@ struct pt_regs * fastcall save_v86_state struct pt_regs *ret; unsigned long tmp; +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr0; +#endif + if (!current->thread.vm86_info) { printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); @@ -111,7 +116,17 @@ struct pt_regs * fastcall save_v86_state do_exit(SIGSEGV); } tss = init_tss + smp_processor_id(); + +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + 
tss->esp0 = current->thread.esp0 = current->thread.saved_esp0; + +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + current->thread.saved_esp0 = 0; ret = KVM86->regs32; return ret; @@ -237,6 +252,11 @@ out: static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) { struct tss_struct *tss; + +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr3; +#endif + /* * make sure the vm86() system call doesn't try to do anything silly */ @@ -278,8 +298,17 @@ static void do_sys_vm86(struct kernel_vm info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; tss = init_tss + smp_processor_id(); + +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr3); +#endif + tss->esp0 = tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr3); +#endif + tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk); diff -NurpX nopatch linux-2.4.36.6/arch/i386/lib/checksum.S linux-2.4.36.6-pax/arch/i386/lib/checksum.S --- linux-2.4.36.6/arch/i386/lib/checksum.S 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/lib/checksum.S 2008-05-08 12:41:32.000000000 +0200 @@ -27,7 +27,8 @@ #include #include - +#include + /* * computes a partial checksum, e.g. 
for TCP/UDP fragments */ @@ -281,12 +282,23 @@ unsigned int csum_partial_copy_generic ( .align 4 .globl csum_partial_copy_generic - +.globl csum_partial_copy_generic_to_user +.globl csum_partial_copy_generic_from_user + #ifndef CONFIG_X86_USE_PPRO_CHECKSUM #define ARGBASE 16 #define FP 12 - + +csum_partial_copy_generic_to_user: + pushl $(__USER_DS) + popl %es + jmp csum_partial_copy_generic + +csum_partial_copy_generic_from_user: + pushl $(__USER_DS) + popl %ds + csum_partial_copy_generic: subl $4,%esp pushl %edi @@ -305,7 +317,7 @@ csum_partial_copy_generic: jmp 4f SRC(1: movw (%esi), %bx ) addl $2, %esi -DST( movw %bx, (%edi) ) +DST( movw %bx, %es:(%edi) ) addl $2, %edi addw %bx, %ax adcl $0, %eax @@ -317,30 +329,30 @@ DST( movw %bx, (%edi) ) SRC(1: movl (%esi), %ebx ) SRC( movl 4(%esi), %edx ) adcl %ebx, %eax -DST( movl %ebx, (%edi) ) +DST( movl %ebx, %es:(%edi) ) adcl %edx, %eax -DST( movl %edx, 4(%edi) ) +DST( movl %edx, %es:4(%edi) ) SRC( movl 8(%esi), %ebx ) SRC( movl 12(%esi), %edx ) adcl %ebx, %eax -DST( movl %ebx, 8(%edi) ) +DST( movl %ebx, %es:8(%edi) ) adcl %edx, %eax -DST( movl %edx, 12(%edi) ) +DST( movl %edx, %es:12(%edi) ) SRC( movl 16(%esi), %ebx ) SRC( movl 20(%esi), %edx ) adcl %ebx, %eax -DST( movl %ebx, 16(%edi) ) +DST( movl %ebx, %es:16(%edi) ) adcl %edx, %eax -DST( movl %edx, 20(%edi) ) +DST( movl %edx, %es:20(%edi) ) SRC( movl 24(%esi), %ebx ) SRC( movl 28(%esi), %edx ) adcl %ebx, %eax -DST( movl %ebx, 24(%edi) ) +DST( movl %ebx, %es:24(%edi) ) adcl %edx, %eax -DST( movl %edx, 28(%edi) ) +DST( movl %edx, %es:28(%edi) ) lea 32(%esi), %esi lea 32(%edi), %edi @@ -354,7 +366,7 @@ DST( movl %edx, 28(%edi) ) shrl $2, %edx # This clears CF SRC(3: movl (%esi), %ebx ) adcl %ebx, %eax -DST( movl %ebx, (%edi) ) +DST( movl %ebx, %es:(%edi) ) lea 4(%esi), %esi lea 4(%edi), %edi dec %edx @@ -366,12 +378,12 @@ DST( movl %ebx, (%edi) ) jb 5f SRC( movw (%esi), %cx ) leal 2(%esi), %esi -DST( movw %cx, (%edi) ) +DST( movw %cx, %es:(%edi) ) leal 2(%edi), %edi 
je 6f shll $16,%ecx SRC(5: movb (%esi), %cl ) -DST( movb %cl, (%edi) ) +DST( movb %cl, %es:(%edi) ) 6: addl %ecx, %eax adcl $0, %eax 7: @@ -382,7 +394,7 @@ DST( movb %cl, (%edi) ) 6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) + movl $-EFAULT, %ss:(%ebx) # zero the complete destination - computing the rest # is too much work @@ -395,11 +407,15 @@ DST( movb %cl, (%edi) ) 6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT,(%ebx) + movl $-EFAULT,%ss:(%ebx) jmp 5000b .previous + pushl %ss + popl %ds + pushl %ss + popl %es popl %ebx popl %esi popl %edi @@ -411,17 +427,28 @@ DST( movb %cl, (%edi) ) /* Version for PentiumII/PPro */ #define ROUND1(x) \ + nop; nop; nop; \ SRC(movl x(%esi), %ebx ) ; \ addl %ebx, %eax ; \ - DST(movl %ebx, x(%edi) ) ; + DST(movl %ebx, %es:x(%edi)); #define ROUND(x) \ + nop; nop; nop; \ SRC(movl x(%esi), %ebx ) ; \ adcl %ebx, %eax ; \ - DST(movl %ebx, x(%edi) ) ; + DST(movl %ebx, %es:x(%edi)); #define ARGBASE 12 - + +csum_partial_copy_generic_to_user: + pushl $(__USER_DS) + popl %es + jmp csum_partial_copy_generic + +csum_partial_copy_generic_from_user: + pushl $(__USER_DS) + popl %ds + csum_partial_copy_generic: pushl %ebx pushl %edi @@ -440,7 +467,7 @@ csum_partial_copy_generic: subl %ebx, %edi lea -1(%esi),%edx andl $-32,%edx - lea 3f(%ebx,%ebx), %ebx + lea 3f(%ebx,%ebx,2), %ebx testl %esi, %esi jmp *%ebx 1: addl $64,%esi @@ -461,19 +488,19 @@ csum_partial_copy_generic: jb 5f SRC( movw (%esi), %dx ) leal 2(%esi), %esi -DST( movw %dx, (%edi) ) +DST( movw %dx, %es:(%edi) ) leal 2(%edi), %edi je 6f shll $16,%edx 5: SRC( movb (%esi), %dl ) -DST( movb %dl, (%edi) ) +DST( movb %dl, %es:(%edi) ) 6: addl %edx, %eax adcl $0, %eax 7: .section .fixup, "ax" 6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) + movl $-EFAULT, %ss:(%ebx) # zero the complete destination (computing the rest is too much work) movl ARGBASE+8(%esp),%edi # dst movl ARGBASE+12(%esp),%ecx # len @@ -481,10 +508,14 @@ DST( 
movb %dl, (%edi) ) rep; stosb jmp 7b 6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT, (%ebx) + movl $-EFAULT, %ss:(%ebx) jmp 7b .previous + pushl %ss + popl %ds + pushl %ss + popl %es popl %esi popl %edi popl %ebx diff -NurpX nopatch linux-2.4.36.6/arch/i386/lib/getuser.S linux-2.4.36.6-pax/arch/i386/lib/getuser.S --- linux-2.4.36.6/arch/i386/lib/getuser.S 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/lib/getuser.S 2008-05-08 12:41:32.000000000 +0200 @@ -9,6 +9,8 @@ * return value. */ +#include + /* * __get_user_X * @@ -31,7 +33,11 @@ __get_user_1: andl $0xffffe000,%edx cmpl addr_limit(%edx),%eax jae bad_get_user + pushl $(__USER_DS) + popl %ds 1: movzbl (%eax),%edx + pushl %ss + pop %ds xorl %eax,%eax ret @@ -44,7 +50,11 @@ __get_user_2: andl $0xffffe000,%edx cmpl addr_limit(%edx),%eax jae bad_get_user + pushl $(__USER_DS) + popl %ds 2: movzwl -1(%eax),%edx + pushl %ss + pop %ds xorl %eax,%eax ret @@ -57,11 +67,17 @@ __get_user_4: andl $0xffffe000,%edx cmpl addr_limit(%edx),%eax jae bad_get_user + pushl $(__USER_DS) + popl %ds 3: movl -3(%eax),%edx + pushl %ss + pop %ds xorl %eax,%eax ret bad_get_user: + pushl %ss + pop %ds xorl %edx,%edx movl $-14,%eax ret diff -NurpX nopatch linux-2.4.36.6/arch/i386/lib/mmx.c linux-2.4.36.6-pax/arch/i386/lib/mmx.c --- linux-2.4.36.6/arch/i386/lib/mmx.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/lib/mmx.c 2008-05-08 12:41:32.000000000 +0200 @@ -30,6 +30,7 @@ void *_mmx_memcpy(void *to, const void * { void *p; int i; + unsigned long cr0; if (in_interrupt()) return __memcpy(to, from, len); @@ -40,52 +41,80 @@ void *_mmx_memcpy(void *to, const void * kernel_fpu_begin(); __asm__ __volatile__ ( - "1: prefetch (%0)\n" /* This set is 28 bytes */ - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 192(%0)\n" - " prefetch 256(%0)\n" + "1: prefetch (%1)\n" /* This set is 28 bytes */ + " prefetch 64(%1)\n" + " prefetch 128(%1)\n" + " prefetch 192(%1)\n" + " prefetch 
256(%1)\n" "2: \n" ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + "3: \n" + +#ifdef CONFIG_PAX_KERNEXEC + " movl %%cr0, %0\n" + " movl %0, %%eax\n" + " andl $0xFFFEFFFF, %%eax\n" + " movl %%eax, %%cr0\n" +#endif + + " movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + +#ifdef CONFIG_PAX_KERNEXEC + " movl %0, %%cr0\n" +#endif + " jmp 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" " .align 4\n" " .long 1b, 3b\n" ".previous" - : : "r" (from) ); + : "=&r" (cr0) : "r" (from) : "ax"); for(; i>5; i--) { __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" - " movq 8(%0), %%mm1\n" - " movq 16(%0), %%mm2\n" - " movq 24(%0), %%mm3\n" - " movq %%mm0, (%1)\n" - " movq %%mm1, 8(%1)\n" - " movq %%mm2, 16(%1)\n" - " movq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm0\n" - " movq 40(%0), %%mm1\n" - " movq 48(%0), %%mm2\n" - " movq 56(%0), %%mm3\n" - " movq %%mm0, 32(%1)\n" - " movq %%mm1, 40(%1)\n" - " movq %%mm2, 48(%1)\n" - " movq %%mm3, 56(%1)\n" + "1: prefetch 320(%1)\n" + "2: movq (%1), %%mm0\n" + " movq 8(%1), %%mm1\n" + " movq 16(%1), %%mm2\n" + " movq 24(%1), %%mm3\n" + " movq %%mm0, (%2)\n" + " movq %%mm1, 8(%2)\n" + " movq %%mm2, 16(%2)\n" + " movq %%mm3, 24(%2)\n" + " movq 32(%1), %%mm0\n" + " movq 40(%1), %%mm1\n" + " movq 48(%1), %%mm2\n" + " movq 56(%1), %%mm3\n" + " movq %%mm0, 32(%2)\n" + " movq %%mm1, 40(%2)\n" + " movq %%mm2, 48(%2)\n" + " movq %%mm3, 56(%2)\n" ".section .fixup, \"ax\"\n" - "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + "3:\n" + +#ifdef CONFIG_PAX_KERNEXEC + " movl %%cr0, %0\n" + " movl %0, %%eax\n" + " andl $0xFFFEFFFF, %%eax\n" + " movl %%eax, %%cr0\n" +#endif + + " movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + +#ifdef CONFIG_PAX_KERNEXEC + " movl %0, %%cr0\n" +#endif + " jmp 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" " .align 4\n" " .long 1b, 3b\n" ".previous" - : : "r" (from), "r" (to) : "memory"); + : "=&r" (cr0) : "r" (from), "r" (to) : "memory", "ax"); from+=64; to+=64; } @@ -164,6 +193,7 @@ 
static void fast_clear_page(void *page) static void fast_copy_page(void *to, void *from) { int i; + unsigned long cr0; kernel_fpu_begin(); @@ -171,51 +201,79 @@ static void fast_copy_page(void *to, voi * but that is for later. -AV */ __asm__ __volatile__ ( - "1: prefetch (%0)\n" - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 192(%0)\n" - " prefetch 256(%0)\n" + "1: prefetch (%1)\n" + " prefetch 64(%1)\n" + " prefetch 128(%1)\n" + " prefetch 192(%1)\n" + " prefetch 256(%1)\n" "2: \n" ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + "3: \n" + +#ifdef CONFIG_PAX_KERNEXEC + " movl %%cr0, %0\n" + " movl %0, %%eax\n" + " andl $0xFFFEFFFF, %%eax\n" + " movl %%eax, %%cr0\n" +#endif + + " movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + +#ifdef CONFIG_PAX_KERNEXEC + " movl %0, %%cr0\n" +#endif + " jmp 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" " .align 4\n" " .long 1b, 3b\n" ".previous" - : : "r" (from) ); + : "=&r" (cr0) : "r" (from) : "ax"); for(i=0; i<(4096-320)/64; i++) { __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" - " movntq %%mm0, (%1)\n" - " movq 8(%0), %%mm1\n" - " movntq %%mm1, 8(%1)\n" - " movq 16(%0), %%mm2\n" - " movntq %%mm2, 16(%1)\n" - " movq 24(%0), %%mm3\n" - " movntq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm4\n" - " movntq %%mm4, 32(%1)\n" - " movq 40(%0), %%mm5\n" - " movntq %%mm5, 40(%1)\n" - " movq 48(%0), %%mm6\n" - " movntq %%mm6, 48(%1)\n" - " movq 56(%0), %%mm7\n" - " movntq %%mm7, 56(%1)\n" + "1: prefetch 320(%1)\n" + "2: movq (%1), %%mm0\n" + " movntq %%mm0, (%2)\n" + " movq 8(%1), %%mm1\n" + " movntq %%mm1, 8(%2)\n" + " movq 16(%1), %%mm2\n" + " movntq %%mm2, 16(%2)\n" + " movq 24(%1), %%mm3\n" + " movntq %%mm3, 24(%2)\n" + " movq 32(%1), %%mm4\n" + " movntq %%mm4, 32(%2)\n" + " movq 40(%1), %%mm5\n" + " movntq %%mm5, 40(%2)\n" + " movq 48(%1), %%mm6\n" + " movntq %%mm6, 48(%2)\n" + " movq 56(%1), %%mm7\n" + " movntq %%mm7, 56(%2)\n" ".section .fixup, \"ax\"\n" - "3: movw 
$0x05EB, 1b\n" /* jmp on 5 bytes */ + "3:\n" + +#ifdef CONFIG_PAX_KERNEXEC + " movl %%cr0, %0\n" + " movl %0, %%eax\n" + " andl $0xFFFEFFFF, %%eax\n" + " movl %%eax, %%cr0\n" +#endif + + " movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + +#ifdef CONFIG_PAX_KERNEXEC + " movl %0, %%cr0\n" +#endif + " jmp 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" " .align 4\n" " .long 1b, 3b\n" ".previous" - : : "r" (from), "r" (to) : "memory"); + : "=&r" (cr0) : "r" (from), "r" (to) : "memory", "ax"); from+=64; to+=64; } @@ -296,56 +354,84 @@ static void fast_clear_page(void *page) static void fast_copy_page(void *to, void *from) { int i; - - + unsigned long cr0; + kernel_fpu_begin(); __asm__ __volatile__ ( - "1: prefetch (%0)\n" - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 192(%0)\n" - " prefetch 256(%0)\n" + "1: prefetch (%1)\n" + " prefetch 64(%1)\n" + " prefetch 128(%1)\n" + " prefetch 192(%1)\n" + " prefetch 256(%1)\n" "2: \n" ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + "3: \n" + +#ifdef CONFIG_PAX_KERNEXEC + " movl %%cr0, %0\n" + " movl %0, %%eax\n" + " andl $0xFFFEFFFF, %%eax\n" + " movl %%eax, %%cr0\n" +#endif + + " movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + +#ifdef CONFIG_PAX_KERNEXEC + " movl %0, %%cr0\n" +#endif + " jmp 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" " .align 4\n" " .long 1b, 3b\n" ".previous" - : : "r" (from) ); + : "=&r" (cr0) : "r" (from) : "ax"); for(i=0; i<4096/64; i++) { __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" - " movq 8(%0), %%mm1\n" - " movq 16(%0), %%mm2\n" - " movq 24(%0), %%mm3\n" - " movq %%mm0, (%1)\n" - " movq %%mm1, 8(%1)\n" - " movq %%mm2, 16(%1)\n" - " movq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm0\n" - " movq 40(%0), %%mm1\n" - " movq 48(%0), %%mm2\n" - " movq 56(%0), %%mm3\n" - " movq %%mm0, 32(%1)\n" - " movq %%mm1, 40(%1)\n" - " movq %%mm2, 48(%1)\n" - " movq %%mm3, 56(%1)\n" + "1: prefetch 320(%1)\n" + "2: movq (%1), %%mm0\n" + " movq 8(%1), 
%%mm1\n" + " movq 16(%1), %%mm2\n" + " movq 24(%1), %%mm3\n" + " movq %%mm0, (%2)\n" + " movq %%mm1, 8(%2)\n" + " movq %%mm2, 16(%2)\n" + " movq %%mm3, 24(%2)\n" + " movq 32(%1), %%mm0\n" + " movq 40(%1), %%mm1\n" + " movq 48(%1), %%mm2\n" + " movq 56(%1), %%mm3\n" + " movq %%mm0, 32(%2)\n" + " movq %%mm1, 40(%2)\n" + " movq %%mm2, 48(%2)\n" + " movq %%mm3, 56(%2)\n" ".section .fixup, \"ax\"\n" - "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + "3:\n" + +#ifdef CONFIG_PAX_KERNEXEC + " movl %%cr0, %0\n" + " movl %0, %%eax\n" + " andl $0xFFFEFFFF, %%eax\n" + " movl %%eax, %%cr0\n" +#endif + + " movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + +#ifdef CONFIG_PAX_KERNEXEC + " movl %0, %%cr0\n" +#endif + " jmp 2b\n" ".previous\n" ".section __ex_table,\"a\"\n" " .align 4\n" " .long 1b, 3b\n" ".previous" - : : "r" (from), "r" (to) : "memory"); + : "=&r" (cr0) : "r" (from), "r" (to) : "memory", "ax"); from+=64; to+=64; } diff -NurpX nopatch linux-2.4.36.6/arch/i386/lib/usercopy.c linux-2.4.36.6-pax/arch/i386/lib/usercopy.c --- linux-2.4.36.6/arch/i386/lib/usercopy.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/lib/usercopy.c 2008-05-08 12:41:32.000000000 +0200 @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_X86_USE_3DNOW_AND_WORKS @@ -75,6 +76,11 @@ __generic_copy_from_user(void *to, const do { \ int __d0, __d1, __d2; \ __asm__ __volatile__( \ + " movw %w0,%%ds\n" \ + : \ + : "r"(__USER_DS) \ + : "memory"); \ + __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ "0: lodsb\n" \ @@ -85,6 +91,8 @@ do { \ " jnz 0b\n" \ "1: subl %1,%0\n" \ "2:\n" \ + " pushl %%ss\n" \ + " popl %%ds\n" \ ".section .fixup,\"ax\"\n" \ "3: movl %5,%0\n" \ " jmp 2b\n" \ @@ -163,10 +171,13 @@ strncpy_from_user(char *dst, const char do { \ int __d0; \ __asm__ __volatile__( \ + " movw %w6,%%es\n" \ "0: rep; stosl\n" \ " movl %2,%0\n" \ "1: rep; stosb\n" \ "2:\n" \ + " pushl %%ss\n" \ + " popl %%es\n" \ ".section .fixup,\"ax\"\n" \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 
2b\n" \ @@ -177,7 +188,8 @@ do { \ " .long 1b,2b\n" \ ".previous" \ : "=&c"(size), "=&D" (__d0) \ - : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ + : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0), \ + "r"(__USER_DS)); \ } while (0) /** @@ -233,6 +245,7 @@ long strnlen_user(const char *s, long n) unsigned long res, tmp; __asm__ __volatile__( + " movw %w8,%%es\n" " testl %0, %0\n" " jz 3f\n" " andl %0,%%ecx\n" @@ -241,6 +254,8 @@ long strnlen_user(const char *s, long n) " subl %%ecx,%0\n" " addl %0,%%eax\n" "1:\n" + " pushl %%ss\n" + " popl %%es\n" ".section .fixup,\"ax\"\n" "2: xorl %%eax,%%eax\n" " jmp 1b\n" @@ -252,7 +267,7 @@ long strnlen_user(const char *s, long n) " .long 0b,2b\n" ".previous" :"=r" (n), "=D" (s), "=a" (res), "=c" (tmp) - :"0" (n), "1" (s), "2" (0), "3" (mask) + :"0" (n), "1" (s), "2" (0), "3" (mask), "r" (__USER_DS) :"cc"); return res & mask; } diff -NurpX nopatch linux-2.4.36.6/arch/i386/mm/fault.c linux-2.4.36.6-pax/arch/i386/mm/fault.c --- linux-2.4.36.6/arch/i386/mm/fault.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/mm/fault.c 2008-05-08 12:41:32.000000000 +0200 @@ -19,6 +19,8 @@ #include #include #include /* For unblank_screen() */ +#include +#include #include #include @@ -78,6 +80,12 @@ good_area: check_stack: if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; + +#ifdef CONFIG_PAX_SEGMEXEC + if ((vma->vm_mm->pax_flags & MF_PAX_SEGMEXEC) && vma->vm_end - SEGMEXEC_TASK_SIZE - 1 < start - SEGMEXEC_TASK_SIZE - 1) + goto bad_area; +#endif + if (expand_stack(vma, start) == 0) goto good_area; @@ -125,7 +133,10 @@ void bust_spinlocks(int yes) } asmlinkage void do_invalid_op(struct pt_regs *, unsigned long); -extern unsigned long idt; + +#ifdef CONFIG_PAX_EMUTRAMP +static int pax_handle_fetch_fault(struct pt_regs *regs); +#endif /* * This routine handles page faults. 
It determines the address, @@ -137,23 +148,31 @@ extern unsigned long idt; * bit 1 == 0 means read, 1 means write * bit 2 == 0 means kernel, 1 means user-mode */ -asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) + +#ifdef CONFIG_PAX_PAGEEXEC +static int do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) +#else +asmlinkage int do_page_fault(struct pt_regs *regs, unsigned long error_code) +#endif { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct * vma; +#ifndef CONFIG_PAX_PAGEEXEC unsigned long address; - unsigned long page; +#endif unsigned long fixup; int write; siginfo_t info; +#ifndef CONFIG_PAX_PAGEEXEC /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); +#endif tsk = current; @@ -202,6 +221,12 @@ asmlinkage void do_page_fault(struct pt_ if (address + 32 < regs->esp) goto bad_area; } + +#ifdef CONFIG_PAX_SEGMEXEC + if ((mm->pax_flags & MF_PAX_SEGMEXEC) && vma->vm_end - SEGMEXEC_TASK_SIZE - 1 < address - SEGMEXEC_TASK_SIZE - 1) + goto bad_area; +#endif + if (expand_stack(vma, address)) goto bad_area; /* @@ -258,7 +283,7 @@ good_area: tsk->thread.screen_bitmap |= 1 << bit; } up_read(&mm->mmap_sem); - return; + return 0; /* * Something tried to access memory that isn't in our memory map.. 
@@ -267,6 +292,38 @@ good_area: bad_area: up_read(&mm->mmap_sem); +#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) + if ((error_code & 4) && !(regs->eflags & X86_EFLAGS_VM)) { + +#ifdef CONFIG_PAX_PAGEEXEC + if ((mm->pax_flags & MF_PAX_PAGEEXEC) && !(error_code & 3) && (regs->eip == address)) { + pax_report_fault(regs, (void*)regs->eip, (void*)regs->esp); + do_exit(SIGKILL); + } +#endif + +#ifdef CONFIG_PAX_SEGMEXEC + if ((mm->pax_flags & MF_PAX_SEGMEXEC) && !(error_code & 3) && (regs->eip + SEGMEXEC_TASK_SIZE == address)) { + +#ifdef CONFIG_PAX_EMUTRAMP + switch (pax_handle_fetch_fault(regs)) { + case 4: + return 0; + + case 3: + case 2: + return 1; + } +#endif + + pax_report_fault(regs, (void*)regs->eip, (void*)regs->esp); + do_exit(SIGKILL); + } +#endif + + } +#endif + /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { tsk->thread.cr2 = address; @@ -278,7 +335,7 @@ bad_area: /* info.si_code has been set above */ info.si_addr = (void *)address; force_sig_info(SIGSEGV, &info, tsk); - return; + return 0; } /* @@ -287,11 +344,11 @@ bad_area: if (boot_cpu_data.f00f_bug) { unsigned long nr; - nr = (address - idt) >> 3; + nr = (address - (unsigned long)idt) >> 3; if (nr == 6) { do_invalid_op(regs, 0); - return; + return 0; } } @@ -299,7 +356,7 @@ no_context: /* Are we prepared to handle this kernel fault? 
*/ if ((fixup = search_exception_table(regs->eip)) != 0) { regs->eip = fixup; - return; + return 0; } /* @@ -311,19 +368,36 @@ no_context: if (address < PAGE_SIZE) printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + +#ifdef CONFIG_PAX_KERNEXEC + else if (init_mm.start_code <= address && address < init_mm.end_code) + printk(KERN_ERR "PAX: %s:%d, uid/euid: %u/%u, attempted to modify kernel code", + tsk->comm, tsk->pid, tsk->uid, tsk->euid); +#endif + else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); printk(" printing eip:\n"); printk("%08lx\n", regs->eip); - asm("movl %%cr3,%0":"=r" (page)); - page = ((unsigned long *) __va(page))[address >> 22]; - printk(KERN_ALERT "*pde = %08lx\n", page); - if (page & 1) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = %08lx\n", page); + { + unsigned long index = pgd_index(address); + unsigned long pgd_paddr; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + asm("movl %%cr3,%0":"=r" (pgd_paddr)); + pgd = index + (pgd_t *)__va(pgd_paddr); + printk(KERN_ALERT "*pgd = %*llx\n", sizeof(*pgd), (unsigned long long)pgd_val(*pgd)); + if (pgd_present(*pgd)) { + pmd = pmd_offset(pgd, address); + printk(KERN_ALERT "*pmd = %*llx\n", sizeof(*pmd), (unsigned long long)pmd_val(*pmd)); + if (pmd_present(*pmd) && !(pmd_val(*pmd) & _PAGE_PSE)) { + pte = pte_offset(pmd, address); + printk(KERN_ALERT "*pte = %*llx\n", sizeof(*pte), (unsigned long long)pte_val(*pte)); + } + } } die("Oops", regs, error_code); bust_spinlocks(0); @@ -363,7 +437,7 @@ do_sigbus: /* Kernel mode? 
Handle exceptions or die */ if (!(error_code & 4)) goto no_context; - return; + return 0; vmalloc_fault: { @@ -396,6 +470,333 @@ vmalloc_fault: pte_k = pte_offset(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; - return; + return 0; + } +} + +#ifdef CONFIG_PAX_PAGEEXEC +/* PaX: called with the page_table_lock spinlock held */ +static inline pte_t * pax_get_pte(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pmd_t *pmd; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return NULL; + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + return NULL; + return pte_offset(pmd, address); +} +#endif + +#ifdef CONFIG_PAX_EMUTRAMP +/* + * PaX: decide what to do with offenders (regs->eip = fault address) + * + * returns 1 when task should be killed + * 2 when sigreturn trampoline was detected + * 3 when rt_sigreturn trampoline was detected + * 4 when gcc trampoline was detected + */ +static int pax_handle_fetch_fault(struct pt_regs *regs) +{ + static const unsigned char trans[8] = { + offsetof(struct pt_regs, eax) / 4, + offsetof(struct pt_regs, ecx) / 4, + offsetof(struct pt_regs, edx) / 4, + offsetof(struct pt_regs, ebx) / 4, + offsetof(struct pt_regs, esp) / 4, + offsetof(struct pt_regs, ebp) / 4, + offsetof(struct pt_regs, esi) / 4, + offsetof(struct pt_regs, edi) / 4, + }; + int err; + + if (regs->eflags & X86_EFLAGS_VM) + return 1; + +#ifndef CONFIG_PAX_EMUSIGRT + if (!(current->mm->pax_flags & MF_PAX_EMUTRAMP)) + return 1; +#endif + + do { /* PaX: sigreturn emulation */ + unsigned char pop, mov; + unsigned short sys; + unsigned long nr; + + err = get_user(pop, (unsigned char *)(regs->eip)); + err |= get_user(mov, (unsigned char *)(regs->eip + 1)); + err |= get_user(nr, (unsigned long *)(regs->eip + 2)); + err |= get_user(sys, (unsigned short *)(regs->eip + 6)); + + if (err) + break; + + if (pop == 0x58 && + mov == 0xb8 && + nr == __NR_sigreturn && + sys == 0x80cd) + { + +#ifdef CONFIG_PAX_EMUSIGRT + int sig; + struct 
k_sigaction *ka; + __sighandler_t handler; + + if (get_user(sig, (int *)regs->esp)) + return 1; + if (sig < 1 || sig > _NSIG || sig == SIGKILL || sig == SIGSTOP) + return 1; + spin_lock_irq(&current->sigmask_lock); + ka = &current->sig->action[sig-1]; + handler = ka->sa.sa_handler; + if (handler == SIG_DFL || handler == SIG_IGN) { + if (!(current->mm->pax_flags & MF_PAX_EMUTRAMP)) + err = 1; + } else if (ka->sa.sa_flags & SA_SIGINFO) + err = 1; + spin_unlock_irq(&current->sigmask_lock); + if (err) + return 1; +#endif + + regs->esp += 4; + regs->eax = nr; + regs->eip += 8; + return 2; + } + } while (0); + + do { /* PaX: rt_sigreturn emulation */ + unsigned char mov; + unsigned short sys; + unsigned long nr; + + err = get_user(mov, (unsigned char *)(regs->eip)); + err |= get_user(nr, (unsigned long *)(regs->eip + 1)); + err |= get_user(sys, (unsigned short *)(regs->eip + 5)); + + if (err) + break; + + if (mov == 0xb8 && + nr == __NR_rt_sigreturn && + sys == 0x80cd) + { + +#ifdef CONFIG_PAX_EMUSIGRT + int sig; + struct k_sigaction *ka; + __sighandler_t handler; + + if (get_user(sig, (int *)regs->esp)) + return 1; + if (sig < 1 || sig > _NSIG || sig == SIGKILL || sig == SIGSTOP) + return 1; + spin_lock_irq(&current->sigmask_lock); + ka = &current->sig->action[sig-1]; + handler = ka->sa.sa_handler; + if (handler == SIG_DFL || handler == SIG_IGN) { + if (!(current->mm->pax_flags & MF_PAX_EMUTRAMP)) + err = 1; + } else if (!(ka->sa.sa_flags & SA_SIGINFO)) + err = 1; + spin_unlock_irq(&current->sigmask_lock); + if (err) + return 1; +#endif + + regs->eax = nr; + regs->eip += 7; + return 3; + } + } while (0); + +#ifdef CONFIG_PAX_EMUSIGRT + if (!(current->mm->pax_flags & MF_PAX_EMUTRAMP)) + return 1; +#endif + + do { /* PaX: gcc trampoline emulation #1 */ + unsigned char mov1, mov2; + unsigned short jmp; + unsigned long addr1, addr2; + + err = get_user(mov1, (unsigned char *)regs->eip); + err |= get_user(addr1, (unsigned long *)(regs->eip + 1)); + err |= get_user(mov2, (unsigned char *)(regs->eip + 5)); + err
|= get_user(addr2, (unsigned long *)(regs->eip + 6)); + err |= get_user(jmp, (unsigned short *)(regs->eip + 10)); + + if (err) + break; + + if ((mov1 & 0xF8) == 0xB8 && + (mov2 & 0xF8) == 0xB8 && + (mov1 & 0x07) != (mov2 & 0x07) && + (jmp & 0xF8FF) == 0xE0FF && + (mov2 & 0x07) == ((jmp>>8) & 0x07)) + { + ((unsigned long *)regs)[trans[mov1 & 0x07]] = addr1; + ((unsigned long *)regs)[trans[mov2 & 0x07]] = addr2; + regs->eip = addr2; + return 4; + } + } while (0); + + do { /* PaX: gcc trampoline emulation #2 */ + unsigned char mov, jmp; + unsigned long addr1, addr2; + + err = get_user(mov, (unsigned char *)regs->eip); + err |= get_user(addr1, (unsigned long *)(regs->eip + 1)); + err |= get_user(jmp, (unsigned char *)(regs->eip + 5)); + err |= get_user(addr2, (unsigned long *)(regs->eip + 6)); + + if (err) + break; + + if ((mov & 0xF8) == 0xB8 && + jmp == 0xE9) + { + ((unsigned long *)regs)[trans[mov & 0x07]] = addr1; + regs->eip += addr2 + 10; + return 4; + } + } while (0); + + return 1; /* PaX in action */ +} +#endif + +#if defined(CONFIG_PAX_PAGEEXEC) || defined(CONFIG_PAX_SEGMEXEC) +void pax_report_insns(void *pc, void *sp) +{ + long i; + + printk(KERN_ERR "PAX: bytes at PC: "); + for (i = 0; i < 20; i++) { + unsigned char c; + if (get_user(c, (unsigned char*)pc+i)) + printk("?? "); + else + printk("%02x ", c); + } + printk("\n"); + + printk(KERN_ERR "PAX: bytes at SP-4: "); + for (i = -1; i < 20; i++) { + unsigned long c; + if (get_user(c, (unsigned long*)sp+i)) + printk("???????? 
"); + else + printk("%08lx ", c); + } + printk("\n"); +} +#endif + +#ifdef CONFIG_PAX_PAGEEXEC +/* + * PaX: handle the extra page faults or pass it down to the original handler + * + * returns 0 when nothing special was detected + * 1 when sigreturn trampoline (syscall) has to be emulated + */ +asmlinkage int pax_do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct mm_struct *mm = current->mm; + unsigned long address; + pte_t *pte; + unsigned char pte_mask; + + __asm__("movl %%cr2,%0":"=r" (address)); + + /* It's safe to allow irq's after cr2 has been saved */ + if (likely(regs->eflags & X86_EFLAGS_IF)) + local_irq_enable(); + + if (unlikely((error_code & 5) != 5 || + address >= TASK_SIZE || + (regs->eflags & X86_EFLAGS_VM) || + !mm || !(mm->pax_flags & MF_PAX_PAGEEXEC))) + return do_page_fault(regs, error_code, address); + + /* PaX: it's our fault, let's handle it if we can */ + + /* PaX: take a look at read faults before acquiring any locks */ + if (unlikely(!(error_code & 2) && (regs->eip == address))) { + /* instruction fetch attempt from a protected page in user mode */ + +#ifdef CONFIG_PAX_EMUTRAMP + switch (pax_handle_fetch_fault(regs)) { + case 4: + return 0; + + case 3: + case 2: + return 1; + } +#endif + + pax_report_fault(regs, (void*)regs->eip, (void*)regs->esp); + do_exit(SIGKILL); + } + + pte_mask = _PAGE_ACCESSED | _PAGE_USER | ((error_code & 2) << (_PAGE_BIT_DIRTY-1)); + + spin_lock(&mm->page_table_lock); + pte = pax_get_pte(mm, address); + if (unlikely(!pte || !(pte_val(*pte) & _PAGE_PRESENT) || pte_exec(*pte))) { + spin_unlock(&mm->page_table_lock); + do_page_fault(regs, error_code, address); + return 0; } + + if (unlikely((error_code & 2) && !pte_write(*pte))) { + /* write attempt to a protected page in user mode */ + spin_unlock(&mm->page_table_lock); + do_page_fault(regs, error_code, address); + return 0; + } + + /* + * PaX: fill DTLB with user rights and retry + */ + __asm__ __volatile__ ( +#ifdef CONFIG_PAX_MEMORY_UDEREF 
+ "movw %w4,%%es\n" +#endif + "orb %2,(%1)\n" +#if defined(CONFIG_M586) || defined(CONFIG_M586TSC) +/* + * PaX: let this uncommented 'invlpg' remind us on the behaviour of Intel's + * (and AMD's) TLBs. namely, they do not cache PTEs that would raise *any* + * page fault when examined during a TLB load attempt. this is true not only + * for PTEs holding a non-present entry but also present entries that will + * raise a page fault (such as those set up by PaX, or the copy-on-write + * mechanism). in effect it means that we do *not* need to flush the TLBs + * for our target pages since their PTEs are simply not in the TLBs at all. + + * the best thing in omitting it is that we gain around 15-20% speed in the + * fast path of the page fault handler and can get rid of tracing since we + * can no longer flush unintended entries. + */ + "invlpg (%0)\n" +#endif + "testb $0,%%es:(%0)\n" + "xorb %3,(%1)\n" +#ifdef CONFIG_PAX_MEMORY_UDEREF + "pushl %%ss\n" + "popl %%es\n" +#endif + : + : "q" (address), "r" (pte), "q" (pte_mask), "i" (_PAGE_USER), "r" (__USER_DS) + : "memory", "cc"); + spin_unlock(&mm->page_table_lock); + return 0; } +#endif diff -NurpX nopatch linux-2.4.36.6/arch/i386/mm/init.c linux-2.4.36.6-pax/arch/i386/mm/init.c --- linux-2.4.36.6/arch/i386/mm/init.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/mm/init.c 2008-05-08 12:41:32.000000000 +0200 @@ -37,6 +37,7 @@ #include #include #include +#include mmu_gather_t mmu_gathers[NR_CPUS]; unsigned long highstart_pfn, highend_pfn; @@ -122,7 +123,7 @@ void show_mem(void) /* References to section boundaries */ -extern char _text, _etext, _edata, __bss_start, _end; +extern char _text, _etext, _data, _edata, __bss_start, _end; extern char __init_begin, __init_end; static inline void set_pte_phys (unsigned long vaddr, @@ -178,17 +179,7 @@ static void __init fixrange_init (unsign pgd = pgd_base + i; for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) { -#if CONFIG_X86_PAE - if (pgd_none(*pgd)) 
{ - pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pgd(pgd, __pgd(__pa(pmd) + 0x1)); - if (pmd != pmd_offset(pgd, 0)) - printk("PAE BUG #02!\n"); - } pmd = pmd_offset(pgd, vaddr); -#else - pmd = (pmd_t *)pgd; -#endif for (; (j < PTRS_PER_PMD) && (vaddr != end); pmd++, j++) { if (pmd_none(*pmd)) { pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); @@ -217,25 +208,22 @@ static void __init pagetable_init (void) end = (unsigned long)__va(max_low_pfn*PAGE_SIZE); pgd_base = swapper_pg_dir; -#if CONFIG_X86_PAE - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(1 + __pa(empty_zero_page))); -#endif i = __pgd_offset(PAGE_OFFSET); pgd = pgd_base + i; + if (cpu_has_pse) { + set_in_cr4(X86_CR4_PSE); + boot_cpu_data.wp_works_ok = 1; + + if (cpu_has_pge) + set_in_cr4(X86_CR4_PGE); + } + for (; i < PTRS_PER_PGD; pgd++, i++) { vaddr = i*PGDIR_SIZE; if (end && (vaddr >= end)) break; -#if CONFIG_X86_PAE - pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pgd(pgd, __pgd(__pa(pmd) + 0x1)); -#else - pmd = (pmd_t *)pgd; -#endif - if (pmd != pmd_offset(pgd, 0)) - BUG(); + pmd = pmd_offset(pgd, PAGE_OFFSET); for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { vaddr = i*PGDIR_SIZE + j*PMD_SIZE; if (end && (vaddr >= end)) @@ -243,14 +231,16 @@ static void __init pagetable_init (void) if (cpu_has_pse) { unsigned long __pe; - set_in_cr4(X86_CR4_PSE); - boot_cpu_data.wp_works_ok = 1; __pe = _KERNPG_TABLE + _PAGE_PSE + __pa(vaddr); /* Make it "global" too if supported */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); + if (cpu_has_pge) __pe += _PAGE_GLOBAL; - } + +#ifdef CONFIG_PAX_KERNEXEC + if (__KERNEL_TEXT_OFFSET <= vaddr && vaddr < (unsigned long)&_data) + __pe &= ~_PAGE_RW; +#endif + set_pmd(pmd, __pmd(__pe)); continue; } @@ -263,6 +253,13 @@ static void __init pagetable_init (void) break; *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL); } + +#ifdef CONFIG_PAX_KERNEXEC + if (__KERNEL_TEXT_OFFSET <= vaddr && vaddr < (unsigned long)&_data) + set_pmd(pmd, 
__pmd((_KERNPG_TABLE & ~_PAGE_RW) + __pa(pte_base))); + else +#endif + set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); if (pte_base != pte_offset(pmd, 0)) BUG(); @@ -289,17 +286,6 @@ static void __init pagetable_init (void) pte = pte_offset(pmd, vaddr); pkmap_page_table = pte; #endif - -#if CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; -#endif } void __init zap_low_mappings (void) @@ -312,7 +298,7 @@ void __init zap_low_mappings (void) * us, because pgd_clear() is a no-op on i386. */ for (i = 0; i < USER_PTRS_PER_PGD; i++) -#if CONFIG_X86_PAE +#ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else set_pgd(swapper_pg_dir+i, __pgd(0)); @@ -353,16 +339,6 @@ void __init paging_init(void) pagetable_init(); load_cr3(swapper_pg_dir); - -#if CONFIG_X86_PAE - /* - * We will bail out later - printk doesn't work right now so - * the user would just see a hanging kernel. 
- */ - if (cpu_has_pae) - set_in_cr4(X86_CR4_PAE); -#endif - __flush_tlb_all(); #ifdef CONFIG_HIGHMEM @@ -508,6 +484,10 @@ void __init mem_init(void) { int codesize, reservedpages, datasize, initsize; +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr0; +#endif + if (!mem_map) BUG(); #ifdef CONFIG_HIGHMEM @@ -524,12 +504,21 @@ void __init mem_init(void) high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); /* clear the zero-page */ + +#ifdef CONFIG_PAX_KERNEXEC + pax_open_kernel(cr0); +#endif + memset(empty_zero_page, 0, PAGE_SIZE); +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + reservedpages = free_pages_init(); codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; + datasize = (unsigned long) &_edata - (unsigned long) &_data; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", @@ -542,10 +531,6 @@ void __init mem_init(void) (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); -#if CONFIG_X86_PAE - if (!cpu_has_pae) - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); -#endif if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); @@ -589,6 +574,26 @@ void free_initmem(void) { unsigned long addr; +#ifdef CONFIG_PAX_KERNEXEC + /* PaX: limit KERNEL_CS to actual size */ + unsigned long limit, cr0; + + limit = (unsigned long)&_etext >> PAGE_SHIFT; + + pax_open_kernel(cr0); + + gdt_table[2].a = (gdt_table[2].a & 0xFFFF0000UL) | (limit & 0x0FFFFUL); + gdt_table[2].b = (gdt_table[2].b & 0xFFF0FFFFUL) | (limit & 0xF0000UL); + +#ifdef CONFIG_PAX_SEGMEXEC + gdt_table2[2].a = (gdt_table2[2].a & 0xFFFF0000UL) | (limit & 0x0FFFFUL); + gdt_table2[2].b = (gdt_table2[2].b & 0xFFF0FFFFUL) | (limit & 0xF0000UL); +#endif + + pax_close_kernel(cr0); +#endif + + memset(&__init_begin, 0, &__init_end - &__init_begin); addr = (unsigned long)(&__init_begin); for 
(; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); diff -NurpX nopatch linux-2.4.36.6/arch/i386/mm/ioremap.c linux-2.4.36.6-pax/arch/i386/mm/ioremap.c --- linux-2.4.36.6/arch/i386/mm/ioremap.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/mm/ioremap.c 2008-05-08 12:41:32.000000000 +0200 @@ -49,7 +49,7 @@ static inline int remap_area_pmd(pmd_t * if (address >= end) BUG(); do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); diff -NurpX nopatch linux-2.4.36.6/arch/i386/mm/pageattr.c linux-2.4.36.6-pax/arch/i386/mm/pageattr.c --- linux-2.4.36.6/arch/i386/mm/pageattr.c 2008-01-01 13:06:40.000000000 +0100 +++ linux-2.4.36.6-pax/arch/i386/mm/pageattr.c 2008-05-08 12:41:32.000000000 +0200 @@ -10,6 +10,7 @@ #include #include #include +#include /* Should move most of this stuff into the appropiate includes */ #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) @@ -63,7 +64,19 @@ static void flush_kernel_map(void * addr static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { + +#ifdef CONFIG_PAX_KERNEXEC + unsigned long cr0; + + pax_open_kernel(cr0); +#endif + set_pte_atomic(kpte, pte); /* change init_mm */ + +#ifdef CONFIG_PAX_KERNEXEC + pax_close_kernel(cr0); +#endif + #ifndef CONFIG_X86_PAE { struct list_head *l; diff -Nu