119 files changed, 35709 insertions, 0 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
new file mode 100644
index 000000000000..80c38c5d71fe
--- /dev/null
+++ b/arch/x86_64/Kconfig
@@ -0,0 +1,477 @@
+#
+# For a description of the syntax of this configuration file,
+# see Documentation/kbuild/kconfig-language.txt.
+#
+# Note: ISA is disabled and will hopefully never be enabled.
+# If you managed to buy an ISA x86-64 box you'll have to fix all the
+# ISA drivers you need yourself.
+#
+
+mainmenu "Linux Kernel Configuration"
+
+config X86_64
+	bool
+	default y
+	help
+	  Port to the x86-64 architecture. x86-64 is a 64-bit extension to the
+	  classical 32-bit x86 architecture. For details see
+	  <http://www.x86-64.org/>.
+
+config 64BIT
+	def_bool y
+
+config X86
+	bool
+	default y
+
+config MMU
+	bool
+	default y
+
+config ISA
+	bool
+
+config SBUS
+	bool
+
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	default y
+
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+
+config GENERIC_CALIBRATE_DELAY
+	bool
+	default y
+
+config X86_CMPXCHG
+	bool
+	default y
+
+config EARLY_PRINTK
+	bool
+	default y
+
+config GENERIC_ISA_DMA
+	bool
+	default y
+
+config GENERIC_IOMAP
+	bool
+	default y
+
+source "init/Kconfig"
+
+
+menu "Processor type and features"
+
+choice
+	prompt "Processor family"
+	default MK8
+
+config MK8
+	bool "AMD-Opteron/Athlon64"
+	help
+	  Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs.
+
+config MPSC
+       bool "Intel EM64T"
+       help
+	  Optimize for Intel Pentium 4 and Xeon CPUs with Intel
+	  Extended Memory 64 Technology(EM64T). For details see
+	  <http://www.intel.com/technology/64bitextensions/>.
+
+config GENERIC_CPU
+	bool "Generic-x86-64"
+	help
+	  Generic x86-64 CPU.
+
+endchoice
+
+#
+# Define implied options from the CPU selection here
+#
+config X86_L1_CACHE_BYTES
+	int
+	default "128" if GENERIC_CPU || MPSC
+	default "64" if MK8
+
+config X86_L1_CACHE_SHIFT
+	int
+	default "7" if GENERIC_CPU || MPSC
+	default "6" if MK8
+
+config X86_TSC
+	bool
+	default y
+
+config X86_GOOD_APIC
+	bool
+	default y
+
+config MICROCODE
+	tristate "/dev/cpu/microcode - Intel CPU microcode support"
+	---help---
+	  If you say Y here the 'File systems' section, you will be
+	  able to update the microcode on Intel processors. You will
+	  obviously need the actual microcode binary data itself which is
+	  not shipped with the Linux kernel.
+
+	  For latest news and information on obtaining all the required
+	  ingredients for this driver, check:
+	  <http://www.urbanmyth.org/microcode/>.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called microcode.
+	  If you use modprobe or kmod you may also want to add the line
+	  'alias char-major-10-184 microcode' to your /etc/modules.conf file.
+
+config X86_MSR
+	tristate "/dev/cpu/*/msr - Model-specific register support"
+	help
+	  This device gives privileged processes access to the x86
+	  Model-Specific Registers (MSRs).  It is a character device with
+	  major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
+	  MSR accesses are directed to a specific CPU on multi-processor
+	  systems.
+
+config X86_CPUID
+	tristate "/dev/cpu/*/cpuid - CPU information support"
+	help
+	  This device gives processes access to the x86 CPUID instruction to
+	  be executed on a specific processor.  It is a character device
+	  with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
+	  /dev/cpu/31/cpuid.
+
+# disable it for opteron optimized builds because it pulls in ACPI_BOOT
+config X86_HT
+	bool
+	depends on SMP && !MK8
+	default y
+
+config MATH_EMULATION
+	bool
+
+config MCA
+	bool
+
+config EISA
+	bool
+
+config X86_IO_APIC
+	bool
+	default y
+
+config X86_LOCAL_APIC
+	bool
+	default y
+
+config MTRR
+	bool "MTRR (Memory Type Range Register) support"
+	---help---
+	  On Intel P6 family processors (Pentium Pro, Pentium II and later)
+	  the Memory Type Range Registers (MTRRs) may be used to control
+	  processor access to memory ranges. This is most useful if you have
+	  a video (VGA) card on a PCI or AGP bus. Enabling write-combining
+	  allows bus write transfers to be combined into a larger transfer
+	  before bursting over the PCI/AGP bus. This can increase performance
+	  of image write operations 2.5 times or more. Saying Y here creates a
+	  /proc/mtrr file which may be used to manipulate your processor's
+	  MTRRs. Typically the X server should use this.
+
+	  This code has a reasonably generic interface so that similar
+	  control registers on other processors can be easily supported
+	  as well.
+
+	  Saying Y here also fixes a problem with buggy SMP BIOSes which only
+	  set the MTRRs for the boot CPU and not for the secondary CPUs. This
+	  can lead to all sorts of problems, so it's good to say Y here.
+
+	  Just say Y here, all x86-64 machines support MTRRs.
+
+	  See <file:Documentation/mtrr.txt> for more information.
+
+config SMP
+	bool "Symmetric multi-processing support"
+	---help---
+	  This enables support for systems with more than one CPU. If you have
+	  a system with only one CPU, like most personal computers, say N. If
+	  you have a system with more than one CPU, say Y.
+
+	  If you say N here, the kernel will run on single and multiprocessor
+	  machines, but will use only one CPU of a multiprocessor machine. If
+	  you say Y here, the kernel will run on many, but not all,
+	  singleprocessor machines. On a singleprocessor machine, the kernel
+	  will run faster if you say N here.
+
+	  If you don't know what to do here, say N.
+
+config PREEMPT
+	bool "Preemptible Kernel"
+	---help---
+	  This option reduces the latency of the kernel when reacting to
+	  real-time or interactive events by allowing a low priority process to
+	  be preempted even if it is in kernel mode executing a system call.
+	  This allows applications to run more reliably even when the system is
+	  under load. On contrary it may also break your drivers and add
+	  priority inheritance problems to your system. Don't select it if
+	  you rely on a stable system or have slightly obscure hardware.
+	  It's also not very well tested on x86-64 currently.
+	  You have been warned.
+
+	  Say Y here if you are feeling brave and building a kernel for a
+	  desktop, embedded or real-time system.  Say N if you are unsure.
+
+config PREEMPT_BKL
+	bool "Preempt The Big Kernel Lock"
+	depends on PREEMPT
+	default y
+	help
+	  This option reduces the latency of the kernel by making the
+	  big kernel lock preemptible.
+
+	  Say Y here if you are building a kernel for a desktop system.
+	  Say N if you are unsure.
+
+config SCHED_SMT
+	bool "SMT (Hyperthreading) scheduler support"
+	depends on SMP
+	default n
+	help
+	  SMT scheduler support improves the CPU scheduler's decision making
+	  when dealing with Intel Pentium 4 chips with HyperThreading at a
+	  cost of slightly increased overhead in some places. If unsure say
+	  N here.
+
+config K8_NUMA
+       bool "K8 NUMA support"
+       select NUMA
+       depends on SMP
+       help
+	  Enable NUMA (Non Unified Memory Architecture) support for
+	  AMD Opteron Multiprocessor systems. The kernel will try to allocate
+	  memory used by a CPU on the local memory controller of the CPU
+	  and add some more NUMA awareness to the kernel.
+	  This code is recommended on all multiprocessor Opteron systems
+	  and normally doesn't hurt on others.
+
+config NUMA_EMU
+	bool "NUMA emulation support"
+	select NUMA
+	depends on SMP
+	help
+	  Enable NUMA emulation. A flat machine will be split
+	  into virtual nodes when booted with "numa=fake=N", where N is the
+	  number of nodes. This is only useful for debugging.
+
+config DISCONTIGMEM
+       bool
+       depends on NUMA
+       default y
+
+config NUMA
+       bool
+       default n
+
+config HAVE_DEC_LOCK
+	bool
+	depends on SMP
+	default y
+
+config NR_CPUS
+	int "Maximum number of CPUs (2-256)"
+	range 2 256
+	depends on SMP
+	default "8"
+	help
+	  This allows you to specify the maximum number of CPUs which this
+	  kernel will support. Current maximum is 256 CPUs due to
+	  APIC addressing limits. Less depending on the hardware.
+
+	  This is purely to save memory - each supported CPU requires
+	  memory in the static kernel configuration.
+
+config HPET_TIMER
+	bool
+	default y
+	help
+	  Use the IA-PC HPET (High Precision Event Timer) to manage
+	  time in preference to the PIT and RTC, if a HPET is
+	  present.  The HPET provides a stable time base on SMP
+	  systems, unlike the TSC, but it is more expensive to access,
+	  as it is off-chip.  You can find the HPET spec at
+	  <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>.
+
+config HPET_EMULATE_RTC
+	bool "Provide RTC interrupt"
+	depends on HPET_TIMER && RTC=y
+
+config GART_IOMMU
+	bool "IOMMU support"
+	depends on PCI
+	help
+	  Support the K8 IOMMU. Needed to run systems with more than 4GB of memory
+	  properly with 32-bit PCI devices that do not support DAC (Double Address
+	  Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter.
+	  Normally the kernel will take the right choice by itself.
+	  If unsure, say Y.
+
+# need this always enabled with GART_IOMMU for the VIA workaround
+config SWIOTLB
+       bool
+       depends on GART_IOMMU
+       default y
+
+config DUMMY_IOMMU
+	bool
+	depends on !GART_IOMMU && !SWIOTLB
+	default y
+	help
+	  Don't use IOMMU code. This will cause problems when you have more than 4GB
+	  of memory and any 32-bit devices. Don't turn on unless you know what you
+	  are doing.
+
+config X86_MCE
+	bool "Machine check support" if EMBEDDED
+	default y
+	help
+	   Include a machine check error handler to report hardware errors.
+	   This version will require the mcelog utility to decode some
+	   machine check error logs. See
+	   ftp://ftp.x86-64.org/pub/linux/tools/mcelog
+
+config X86_MCE_INTEL
+	bool "Intel MCE features"
+	depends on X86_MCE && X86_LOCAL_APIC
+	default y
+	help
+	   Additional support for intel specific MCE features such as
+	   the thermal monitor.
+
+config SECCOMP
+	bool "Enable seccomp to safely compute untrusted bytecode"
+	depends on PROC_FS
+	default y
+	help
+	  This kernel feature is useful for number crunching applications
+	  that may need to compute untrusted bytecode during their
+	  execution. By using pipes or other transports made available to
+	  the process as file descriptors supporting the read/write
+	  syscalls, it's possible to isolate those applications in
+	  their own address space using seccomp. Once seccomp is
+	  enabled via /proc/<pid>/seccomp, it cannot be disabled
+	  and the task is only allowed to execute a few safe syscalls
+	  defined by each seccomp mode.
+
+	  If unsure, say Y. Only embedded should say N here.
+
+endmenu
+
+#
+# Use the generic interrupt handling code in kernel/irq/:
+#
+config GENERIC_HARDIRQS
+	bool
+	default y
+
+config GENERIC_IRQ_PROBE
+	bool
+	default y
+
+menu "Power management options"
+
+source kernel/power/Kconfig
+
+source "drivers/acpi/Kconfig"
+
+source "arch/x86_64/kernel/cpufreq/Kconfig"
+
+endmenu
+
+menu "Bus options (PCI etc.)"
+
+config PCI
+	bool "PCI support"
+
+# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
+config PCI_DIRECT
+	bool
+	depends on PCI
+	default y
+
+config PCI_MMCONFIG
+	bool "Support mmconfig PCI config space access"
+	depends on PCI
+	select ACPI_BOOT
+
+config UNORDERED_IO
+       bool "Unordered IO mapping access"
+       depends on EXPERIMENTAL
+       help
+         Use unordered stores to access IO memory mappings in device drivers.
+	 Still very experimental. When a driver works on IA64/ppc64/pa-risc it should
+	 work with this option, but it makes the drivers behave differently
+	 from i386. Requires that the driver writer used memory barriers
+	 properly.
+
+source "drivers/pci/pcie/Kconfig"
+
+source "drivers/pci/Kconfig"
+
+source "drivers/pcmcia/Kconfig"
+
+source "drivers/pci/hotplug/Kconfig"
+
+endmenu
+
+
+menu "Executable file formats / Emulations"
+
+source "fs/Kconfig.binfmt"
+
+config IA32_EMULATION
+	bool "IA32 Emulation"
+	help
+	  Include code to run 32-bit programs under a 64-bit kernel. You should likely
+	  turn this on, unless you're 100% sure that you don't have any 32-bit programs
+	  left.
+
+config IA32_AOUT
+       bool "IA32 a.out support"
+       depends on IA32_EMULATION
+       help
+         Support old a.out binaries in the 32bit emulation.
+
+config COMPAT
+	bool
+	depends on IA32_EMULATION
+	default y
+
+config SYSVIPC_COMPAT
+	bool
+	depends on COMPAT && SYSVIPC
+	default y
+
+config UID16
+	bool
+	depends on IA32_EMULATION
+	default y
+
+endmenu
+
+source drivers/Kconfig
+
+source "drivers/firmware/Kconfig"
+
+source fs/Kconfig
+
+source "arch/x86_64/oprofile/Kconfig"
+
+source "arch/x86_64/Kconfig.debug"
+
+source "security/Kconfig"
+
+source "crypto/Kconfig"
+
+source "lib/Kconfig"
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
new file mode 100644
index 000000000000..9cf1410d2f5a
--- /dev/null
+++ b/arch/x86_64/Kconfig.debug
@@ -0,0 +1,57 @@
+menu "Kernel hacking"
+
+source "lib/Kconfig.debug"
+
+# !SMP for now because the context switch early causes GPF in segment reloading
+# and the GS base checking does the wrong thing then, causing a hang.
+config CHECKING
+	bool "Additional run-time checks"
+	depends on DEBUG_KERNEL && !SMP
+	help
+	  Enables some internal consistency checks for kernel debugging.
+	  You should normally say N.
+
+config INIT_DEBUG
+	bool "Debug __init statements"
+	depends on DEBUG_KERNEL
+	help
+	  Fill __init and __initdata at the end of boot. This helps debugging
+	  illegal uses of __init and __initdata after initialization.
+
+config IOMMU_DEBUG
+       depends on GART_IOMMU && DEBUG_KERNEL
+       bool "Enable IOMMU debugging"
+       help
+         Force the IOMMU to on even when you have less than 4GB of
+	 memory and add debugging code. On overflow always panic. And
+	 allow to enable IOMMU leak tracing. Can be disabled at boot
+	 time with iommu=noforce. This will also enable scatter gather
+	 list merging.  Currently not recommended for production
+	 code. When you use it make sure you have a big enough
+	 IOMMU/AGP aperture.  Most of the options enabled by this can
+	 be set more finegrained using the iommu= command line
+	 options. See Documentation/x86_64/boot-options.txt for more
+	 details.
+
+config KPROBES
+	bool "Kprobes"
+	depends on DEBUG_KERNEL
+	help
+	  Kprobes allows you to trap at almost any kernel address and
+	  execute a callback function.  register_kprobe() establishes
+	  a probepoint and specifies the callback.  Kprobes is useful
+	  for kernel debugging, non-intrusive instrumentation and testing.
+	  If in doubt, say "N".
+
+config IOMMU_LEAK
+       bool "IOMMU leak tracing"
+       depends on DEBUG_KERNEL
+       depends on IOMMU_DEBUG
+       help
+         Add a simple leak tracer to the IOMMU code. This is useful when you
+	 are debugging a buggy device driver that leaks IOMMU mappings.
+
+#config X86_REMOTE_DEBUG
+#       bool "kgdb debugging stub"
+
+endmenu
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
new file mode 100644
index 000000000000..6f90c246c418
--- /dev/null
+++ b/arch/x86_64/Makefile
@@ -0,0 +1,119 @@
+#
+# x86_64/Makefile
+#
+# This file is included by the global makefile so that you can add your own
+# architecture-specific flags and dependencies. Remember to do have actions
+# for "archclean" and "archdep" for cleaning up and making dependencies for
+# this architecture
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 1994 by Linus Torvalds
+#
+# 19990713  Artur Skawina <skawina@geocities.com>
+#           Added '-march' and '-mpreferred-stack-boundary' support
+# 20000913  Pavel Machek <pavel@suse.cz>
+#	    Converted for x86_64 architecture
+# 20010105  Andi Kleen, add IA32 compiler.
+#           ....and later removed it again....
+#
+# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $
+
+#
+# early bootup linking needs 32bit. You can either use real 32bit tools
+# here or 64bit tools in 32bit mode.
+#
+IA32_CC := $(CC) $(CPPFLAGS) -m32 -O2 -fomit-frame-pointer
+IA32_LD := $(LD) -m elf_i386
+IA32_AS := $(CC) $(AFLAGS) -m32 -Wa,--32 -traditional -c
+IA32_OBJCOPY := $(CROSS_COMPILE)objcopy
+IA32_CPP := $(CROSS_COMPILE)gcc -m32 -E
+export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP
+
+
+LDFLAGS		:= -m elf_x86_64
+OBJCOPYFLAGS	:= -O binary -R .note -R .comment -S
+LDFLAGS_vmlinux := -e stext
+
+CHECKFLAGS      += -D__x86_64__ -m64
+
+cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
+cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
+CFLAGS += $(cflags-y)
+
+CFLAGS += -mno-red-zone
+CFLAGS += -mcmodel=kernel
+CFLAGS += -pipe
+# this makes reading assembly source easier, but produces worse code
+# actually it makes the kernel smaller too.
+CFLAGS += -fno-reorder-blocks	
+CFLAGS += -Wno-sign-compare
+ifneq ($(CONFIG_DEBUG_INFO),y)
+CFLAGS += -fno-asynchronous-unwind-tables
+# -fweb shrinks the kernel a bit, but the difference is very small
+# it also messes up debugging, so don't use it for now.
+#CFLAGS += $(call cc-option,-fweb)
+endif
+# -funit-at-a-time shrinks the kernel .text considerably
+# unfortunately it makes reading oopses harder.
+CFLAGS += $(call cc-option,-funit-at-a-time)
+# prevent gcc from generating any FP code by mistake
+CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+
+head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
+
+libs-y 					+= arch/x86_64/lib/
+core-y					+= arch/x86_64/kernel/ arch/x86_64/mm/
+core-$(CONFIG_IA32_EMULATION)		+= arch/x86_64/ia32/
+drivers-$(CONFIG_PCI)			+= arch/x86_64/pci/
+drivers-$(CONFIG_OPROFILE)		+= arch/x86_64/oprofile/
+
+boot := arch/x86_64/boot
+
+.PHONY: bzImage bzlilo install archmrproper \
+	fdimage fdimage144 fdimage288 archclean
+
+#Default target when executing "make"
+all: bzImage
+
+BOOTIMAGE                     := arch/x86_64/boot/bzImage
+KBUILD_IMAGE                  := $(BOOTIMAGE)
+
+bzImage: vmlinux
+	$(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE)
+
+bzlilo: vmlinux
+	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo
+
+bzdisk: vmlinux
+	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk
+
+install fdimage fdimage144 fdimage288: vmlinux
+	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
+
+archclean:
+	$(Q)$(MAKE) $(clean)=$(boot)
+
+prepare: include/asm-$(ARCH)/offset.h
+
+arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \
+				   include/config/MARKER
+
+include/asm-$(ARCH)/offset.h: arch/$(ARCH)/kernel/asm-offsets.s
+	$(call filechk,gen-asm-offsets)
+
+CLEAN_FILES += include/asm-$(ARCH)/offset.h
+
+define archhelp
+  echo  '* bzImage	- Compressed kernel image (arch/$(ARCH)/boot/bzImage)'
+  echo  '  install	- Install kernel using'
+  echo  '                  (your) ~/bin/installkernel or'
+  echo  '                  (distribution) /sbin/installkernel or'
+  echo  '        	  install to $$(INSTALL_PATH) and run lilo'
+endef
+
+CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf
+
+
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile
new file mode 100644
index 000000000000..f4399c701b77
--- /dev/null
+++ b/arch/x86_64/boot/Makefile
@@ -0,0 +1,102 @@
+#
+# arch/x86_64/boot/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 1994 by Linus Torvalds
+#
+
+# ROOT_DEV specifies the default root-device when making the image.
+# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
+# the default of FLOPPY is used by 'build'.
+
+ROOT_DEV := CURRENT
+
+# If you want to preset the SVGA mode, uncomment the next line and
+# set SVGA_MODE to whatever number you want.
+# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
+# The number is the same as you would ordinarily press at bootup.
+
+SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
+
+# If you want the RAM disk device, define this to be the size in blocks.
+
+#RAMDISK := -DRAMDISK=512
+
+targets		:= vmlinux.bin bootsect bootsect.o \
+		   setup setup.o bzImage mtools.conf
+
+EXTRA_CFLAGS := -m32
+
+hostprogs-y	:= tools/build
+HOST_EXTRACFLAGS += $(LINUXINCLUDE)
+subdir-		:= compressed/	#Let make clean descend in compressed/
+# ---------------------------------------------------------------------------
+
+$(obj)/bzImage: IMAGE_OFFSET := 0x100000
+$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
+$(obj)/bzImage: BUILDFLAGS   := -b
+
+quiet_cmd_image = BUILD   $@
+cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \
+	    $(obj)/vmlinux.bin $(ROOT_DEV) > $@
+
+$(obj)/bzImage: $(obj)/bootsect $(obj)/setup \
+			      $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+	$(call if_changed,image)
+	@echo 'Kernel: $@ is ready'
+
+$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
+	$(call if_changed,objcopy)
+
+LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary
+LDFLAGS_setup	 := -Ttext 0x0 -s --oformat binary -e begtext
+
+$(obj)/setup $(obj)/bootsect: %: %.o FORCE
+	$(call if_changed,ld)
+
+$(obj)/compressed/vmlinux: FORCE
+	$(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
+
+# Set this if you want to pass append arguments to the zdisk/fdimage kernel
+FDARGS = 
+
+$(obj)/mtools.conf: $(src)/mtools.conf.in
+	sed -e 's|@OBJ@|$(obj)|g' < $< > $@
+
+# This requires write access to /dev/fd0
+zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
+	MTOOLSRC=$(obj)/mtools.conf mformat a:			; sync
+	syslinux /dev/fd0					; sync
+	echo 'default linux $(FDARGS)' | \
+		MTOOLSRC=$(obj)/mtools.conf mcopy - a:syslinux.cfg
+	MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux	; sync
+
+# These require being root or having syslinux 2.02 or higher installed
+fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
+	dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
+	MTOOLSRC=$(obj)/mtools.conf mformat v:			; sync
+	syslinux $(obj)/fdimage					; sync
+	echo 'default linux $(FDARGS)' | \
+		MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
+	MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux	; sync
+
+fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
+	dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
+	MTOOLSRC=$(obj)/mtools.conf mformat w:			; sync
+	syslinux $(obj)/fdimage					; sync
+	echo 'default linux $(FDARGS)' | \
+		MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
+	MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux	; sync
+
+zlilo: $(BOOTIMAGE)
+	if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
+	if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
+	cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
+	cp System.map $(INSTALL_PATH)/
+	if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
+
+install: $(BOOTIMAGE)
+	sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
diff --git a/arch/x86_64/boot/bootsect.S b/arch/x86_64/boot/bootsect.S
new file mode 100644
index 000000000000..bb15d406ee95
--- /dev/null
+++ b/arch/x86_64/boot/bootsect.S
@@ -0,0 +1,98 @@
+/*
+ *	bootsect.S		Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ *	modified by Drew Eckhardt
+ *	modified by Bruce Evans (bde)
+ *	modified by Chris Noe (May 1999) (as86 -> gas)
+ *	gutted by H. Peter Anvin (Jan 2003)
+ *
+ * BIG FAT NOTE: We're in real mode using 64k segments.  Therefore segment
+ * addresses must be multiplied by 16 to obtain their respective linear
+ * addresses. To avoid confusion, linear addresses are written using leading
+ * hex while segment addresses are written as segment:offset.
+ *
+ */
+
+#include <asm/boot.h>
+
+SETUPSECTS	= 4			/* default nr of setup-sectors */
+BOOTSEG		= 0x07C0		/* original address of boot-sector */
+INITSEG		= DEF_INITSEG		/* we move boot here - out of the way */
+SETUPSEG	= DEF_SETUPSEG		/* setup starts here */
+SYSSEG		= DEF_SYSSEG		/* system loaded at 0x10000 (65536) */
+SYSSIZE		= DEF_SYSSIZE		/* system size: # of 16-byte clicks */
+					/* to be loaded */
+ROOT_DEV	= 0 			/* ROOT_DEV is now written by "build" */
+SWAP_DEV	= 0			/* SWAP_DEV is now written by "build" */
+
+#ifndef SVGA_MODE
+#define SVGA_MODE ASK_VGA
+#endif
+
+#ifndef RAMDISK
+#define RAMDISK 0
+#endif
+
+#ifndef ROOT_RDONLY
+#define ROOT_RDONLY 1
+#endif
+
+.code16
+.text
+
+.global _start
+_start:
+
+	# Normalize the start address
+	jmpl	$BOOTSEG, $start2
+
+start2:
+	movw	%cs, %ax
+	movw	%ax, %ds
+	movw	%ax, %es
+	movw	%ax, %ss
+	movw	$0x7c00, %sp
+	sti
+	cld
+
+	movw	$bugger_off_msg, %si
+
+msg_loop:
+	lodsb
+	andb	%al, %al
+	jz	die
+	movb	$0xe, %ah
+	movw	$7, %bx
+ 	int	$0x10
+	jmp	msg_loop
+
+die:
+	# Allow the user to press a key, then reboot
+	xorw	%ax, %ax
+	int	$0x16
+	int	$0x19
+	
+	# int 0x19 should never return.  In case it does anyway,
+	# invoke the BIOS reset code...
+	ljmp	$0xf000,$0xfff0
+
+
+bugger_off_msg:
+	.ascii	"Direct booting from floppy is no longer supported.\r\n"
+	.ascii	"Please use a boot loader program instead.\r\n"
+	.ascii	"\n"
+	.ascii	"Remove disk and press any key to reboot . . .\r\n"
+	.byte	0
+
+
+	# Kernel attributes; used by setup
+
+	.org 497
+setup_sects:	.byte SETUPSECTS
+root_flags:	.word ROOT_RDONLY
+syssize:	.word SYSSIZE
+swap_dev:	.word SWAP_DEV
+ram_size:	.word RAMDISK
+vid_mode:	.word SVGA_MODE
+root_dev:	.word ROOT_DEV
+boot_flag:	.word 0xAA55
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
new file mode 100644
index 000000000000..f89d96f11a9f
--- /dev/null
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -0,0 +1,32 @@
+#
+# linux/arch/x86_64/boot/compressed/Makefile
+#
+# create a compressed vmlinux image from the original vmlinux
+#
+# Note all the files here are compiled/linked as 32bit executables.
+#
+
+targets		:= vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
+EXTRA_AFLAGS	:= -traditional -m32
+
+# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with
+# -m32
+CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2  -fno-strict-aliasing
+LDFLAGS := -m elf_i386
+
+LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386
+
+$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
+	$(call if_changed,ld)
+	@:
+
+$(obj)/vmlinux.bin: vmlinux FORCE
+	$(call if_changed,objcopy)
+
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,gzip)
+
+LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
+
+$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
+	$(call if_changed,ld)
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
new file mode 100644
index 000000000000..27264dbd575c
--- /dev/null
+++ b/arch/x86_64/boot/compressed/head.S
@@ -0,0 +1,142 @@
+/*
+ *  linux/boot/head.S
+ *
+ *  Copyright (C) 1991, 1992, 1993  Linus Torvalds
+ *
+ *  $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $	 		
+ */
+
+/*
+ *  head.S contains the 32-bit startup code.
+ *
+ * NOTE!!! Startup happens at absolute address 0x00001000, which is also where
+ * the page directory will exist. The startup code will be overwritten by
+ * the page directory. [According to comments etc elsewhere on a compressed
+ * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
+ *
+ * Page 0 is deliberately kept safe, since System Management Mode code in 
+ * laptops may need to access the BIOS data stored there.  This is also
+ * useful for future device drivers that either access the BIOS via VM86 
+ * mode.
+ */
+
+/*
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996	
+ */
+.code32
+.text
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+
+	.code32
+	.globl startup_32
+	
+startup_32:
+	cld
+	cli
+	movl $(__KERNEL_DS),%eax
+	movl %eax,%ds
+	movl %eax,%es
+	movl %eax,%fs
+	movl %eax,%gs
+
+	lss stack_start,%esp
+	xorl %eax,%eax
+1:	incl %eax		# check that A20 really IS enabled
+	movl %eax,0x000000	# loop forever if it isn't
+	cmpl %eax,0x100000
+	je 1b
+
+/*
+ * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
+ * confuse the debugger if this code is traced.
+ * XXX - best to initialize before switching to protected mode.
+ */
+	pushl $0
+	popfl
+/*
+ * Clear BSS
+ */
+	xorl %eax,%eax
+	movl $_edata,%edi
+	movl $_end,%ecx
+	subl %edi,%ecx
+	cld
+	rep
+	stosb
+/*
+ * Do the decompression, and jump to the new kernel..
+ */
+	subl $16,%esp	# place for structure on the stack
+	movl %esp,%eax
+	pushl %esi	# real mode pointer as second arg
+	pushl %eax	# address of structure as first arg
+	call decompress_kernel
+	orl  %eax,%eax 
+	jnz  3f
+	addl $8,%esp
+	xorl %ebx,%ebx
+	ljmp $(__KERNEL_CS), $0x100000
+
+/*
+ * We come here, if we were loaded high.
+ * We need to move the move-in-place routine down to 0x1000
+ * and then start it with the buffer addresses in registers,
+ * which we got from the stack.
+ */
+3:
+	movl %esi,%ebx	
+	movl $move_routine_start,%esi
+	movl $0x1000,%edi
+	movl $move_routine_end,%ecx
+	subl %esi,%ecx
+	addl $3,%ecx
+	shrl $2,%ecx
+	cld
+	rep
+	movsl
+
+	popl %esi	# discard the address
+	addl $4,%esp	# real mode pointer
+	popl %esi	# low_buffer_start
+	popl %ecx	# lcount
+	popl %edx	# high_buffer_start
+	popl %eax	# hcount
+	movl $0x100000,%edi
+	cli		# make sure we don't get interrupted
+	ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
+
+/*
+ * Routine (template) for moving the decompressed kernel in place,
+ * if we were high loaded. This _must_ PIC-code !
+ */
+move_routine_start:
+	movl %ecx,%ebp
+	shrl $2,%ecx
+	rep
+	movsl
+	movl %ebp,%ecx
+	andl $3,%ecx
+	rep
+	movsb
+	movl %edx,%esi
+	movl %eax,%ecx	# NOTE: rep movsb won't move if %ecx == 0
+	addl $3,%ecx
+	shrl $2,%ecx
+	rep
+	movsl
+	movl %ebx,%esi	# Restore setup pointer
+	xorl %ebx,%ebx
+	ljmp $(__KERNEL_CS), $0x100000
+move_routine_end:
+
+
+/* Stack for uncompression */ 	
+	.align 32
+user_stack:	 	
+	.fill 4096,4,0
+stack_start:	
+	.long user_stack+4096
+	.word __KERNEL_DS
+
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
new file mode 100644
index 000000000000..c8b9216f9e63
--- /dev/null
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -0,0 +1,354 @@
+/*
+ * misc.c
+ * 
+ * This is a collection of several routines from gzip-1.0.3 
+ * adapted for Linux.
+ *
+ * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
+ * puts by Nick Holloway 1993, better puts by Martin Mares 1995
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ */
+
+#include "miscsetup.h"
+#include <asm/io.h>
+
+/*
+ * gzip declarations
+ */
+
+#define OF(args)  args
+#define STATIC static
+
+#undef memset
+#undef memcpy
+#define memzero(s, n)     memset ((s), 0, (n))
+
+typedef unsigned char  uch;
+typedef unsigned short ush;
+typedef unsigned long  ulg;
+
+#define WSIZE 0x8000		/* Window size must be at least 32k, */
+				/* and a power of two */
+
+static uch *inbuf;	     /* input buffer */
+static uch window[WSIZE];    /* Sliding window buffer */
+
+static unsigned insize = 0;  /* valid bytes in inbuf */
+static unsigned inptr = 0;   /* index of next byte to be processed in inbuf */
+static unsigned outcnt = 0;  /* bytes in output buffer */
+
+/* gzip flag byte */
+#define ASCII_FLAG   0x01 /* bit 0 set: file probably ASCII text */
+#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
+#define EXTRA_FIELD  0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME    0x08 /* bit 3 set: original file name present */
+#define COMMENT      0x10 /* bit 4 set: file comment present */
+#define ENCRYPTED    0x20 /* bit 5 set: file is encrypted */
+#define RESERVED     0xC0 /* bit 6,7:   reserved */
+
+#define get_byte()  (inptr < insize ? inbuf[inptr++] : fill_inbuf())
+		
+/* Diagnostic functions */
+#ifdef DEBUG
+#  define Assert(cond,msg) {if(!(cond)) error(msg);}
+#  define Trace(x) fprintf x
+#  define Tracev(x) {if (verbose) fprintf x ;}
+#  define Tracevv(x) {if (verbose>1) fprintf x ;}
+#  define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
+#  define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
+#else
+#  define Assert(cond,msg)
+#  define Trace(x)
+#  define Tracev(x)
+#  define Tracevv(x)
+#  define Tracec(c,x)
+#  define Tracecv(c,x)
+#endif
+
+static int  fill_inbuf(void);
+static void flush_window(void);
+static void error(char *m);
+static void gzip_mark(void **);
+static void gzip_release(void **);
+  
+/*
+ * This is set up by the setup-routine at boot-time
+ */
+static unsigned char *real_mode; /* Pointer to real-mode data */
+
+#define EXT_MEM_K   (*(unsigned short *)(real_mode + 0x2))
+#ifndef STANDARD_MEMORY_BIOS_CALL
+#define ALT_MEM_K   (*(unsigned long *)(real_mode + 0x1e0))
+#endif
+#define SCREEN_INFO (*(struct screen_info *)(real_mode+0))
+
+extern char input_data[];
+extern int input_len;
+
+static long bytes_out = 0;
+static uch *output_data;
+static unsigned long output_ptr = 0;
+
+static void *malloc(int size);
+static void free(void *where);
+ 
+static void putstr(const char *);
+  
+extern int end;
+static long free_mem_ptr = (long)&end;
+static long free_mem_end_ptr;
+
+#define INPLACE_MOVE_ROUTINE  0x1000
+#define LOW_BUFFER_START      0x2000
+#define LOW_BUFFER_MAX       0x90000
+#define HEAP_SIZE             0x3000
+static unsigned int low_buffer_end, low_buffer_size;
+static int high_loaded =0;
+static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
+
+static char *vidmem = (char *)0xb8000;
+static int vidport;
+static int lines, cols;
+
+#include "../../../../lib/inflate.c"
+
+static void *malloc(int size)
+{
+	void *p;
+
+	if (size <0) error("Malloc error");
+	if (free_mem_ptr <= 0) error("Memory error");
+
+	free_mem_ptr = (free_mem_ptr + 3) & ~3;	/* Align */
+
+	p = (void *)free_mem_ptr;
+	free_mem_ptr += size;
+
+	if (free_mem_ptr >= free_mem_end_ptr)
+		error("Out of memory");
+
+	return p;
+}
+
+static void free(void *where)
+{	/* Don't care */
+}
+
+static void gzip_mark(void **ptr)
+{
+	*ptr = (void *) free_mem_ptr;
+}
+
+static void gzip_release(void **ptr)
+{
+	free_mem_ptr = (long) *ptr;
+}
+ 
+static void scroll(void)
+{
+	int i;
+
+	memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
+	for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
+		vidmem[i] = ' ';
+}
+
+static void putstr(const char *s)
+{
+	int x,y,pos;
+	char c;
+
+	x = SCREEN_INFO.orig_x;
+	y = SCREEN_INFO.orig_y;
+
+	while ( ( c = *s++ ) != '\0' ) {
+		if ( c == '\n' ) {
+			x = 0;
+			if ( ++y >= lines ) {
+				scroll();
+				y--;
+			}
+		} else {
+			vidmem [ ( x + cols * y ) * 2 ] = c; 
+			if ( ++x >= cols ) {
+				x = 0;
+				if ( ++y >= lines ) {
+					scroll();
+					y--;
+				}
+			}
+		}
+	}
+
+	SCREEN_INFO.orig_x = x;
+	SCREEN_INFO.orig_y = y;
+
+	pos = (x + cols * y) * 2;	/* Update cursor position */
+	outb_p(14, vidport);
+	outb_p(0xff & (pos >> 9), vidport+1);
+	outb_p(15, vidport);
+	outb_p(0xff & (pos >> 1), vidport+1);
+}
+
+void* memset(void* s, int c, unsigned n)
+{
+	int i;
+	char *ss = (char*)s;
+
+	for (i=0;i<n;i++) ss[i] = c;
+	return s;
+}
+
+void* memcpy(void* dest, const void* src, unsigned n)
+{
+	int i;
+	char *d = (char *)dest, *s = (char *)src;
+
+	for (i=0;i<n;i++) d[i] = s[i];
+	return dest;
+}
+
+/* ===========================================================================
+ * Fill the input buffer. This is called only when the buffer is empty
+ * and at least one byte is really needed.
+ */
+static int fill_inbuf(void)
+{
+	if (insize != 0) {
+		error("ran out of input data");
+	}
+
+	inbuf = input_data;
+	insize = input_len;
+	inptr = 1;
+	return inbuf[0];
+}
+
+/* ===========================================================================
+ * Write the output window window[0..outcnt-1] and update crc and bytes_out.
+ * (Used for the decompressed data only.)
+ */
+static void flush_window_low(void)
+{
+    ulg c = crc;         /* temporary variable */
+    unsigned n;
+    uch *in, *out, ch;
+    
+    in = window;
+    out = &output_data[output_ptr]; 
+    for (n = 0; n < outcnt; n++) {
+	    ch = *out++ = *in++;
+	    c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+    }
+    crc = c;
+    bytes_out += (ulg)outcnt;
+    output_ptr += (ulg)outcnt;
+    outcnt = 0;
+}
+
+static void flush_window_high(void)
+{
+    ulg c = crc;         /* temporary variable */
+    unsigned n;
+    uch *in,  ch;
+    in = window;
+    for (n = 0; n < outcnt; n++) {
+	ch = *output_data++ = *in++;
+	if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
+	c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+    }
+    crc = c;
+    bytes_out += (ulg)outcnt;
+    outcnt = 0;
+}
+
+static void flush_window(void)
+{
+	if (high_loaded) flush_window_high();
+	else flush_window_low();
+}
+
+static void error(char *x)
+{
+	putstr("\n\n");
+	putstr(x);
+	putstr("\n\n -- System halted");
+
+	while(1);
+}
+
+void setup_normal_output_buffer(void)
+{
+#ifdef STANDARD_MEMORY_BIOS_CALL
+	if (EXT_MEM_K < 1024) error("Less than 2MB of memory");
+#else
+	if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory");
+#endif
+	output_data = (char *)0x100000; /* Points to 1M */
+	free_mem_end_ptr = (long)real_mode;
+}
+
+struct moveparams {
+	uch *low_buffer_start;  int lcount;
+	uch *high_buffer_start; int hcount;
+};
+
+void setup_output_buffer_if_we_run_high(struct moveparams *mv)
+{
+	high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
+#ifdef STANDARD_MEMORY_BIOS_CALL
+	if (EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
+#else
+	if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
+#endif	
+	mv->low_buffer_start = output_data = (char *)LOW_BUFFER_START;
+	low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
+	  ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
+	low_buffer_size = low_buffer_end - LOW_BUFFER_START;
+	high_loaded = 1;
+	free_mem_end_ptr = (long)high_buffer_start;
+	if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) {
+		high_buffer_start = (uch *)(0x100000 + low_buffer_size);
+		mv->hcount = 0; /* say: we need not to move high_buffer */
+	}
+	else mv->hcount = -1;
+	mv->high_buffer_start = high_buffer_start;
+}
+
+void close_output_buffer_if_we_run_high(struct moveparams *mv)
+{
+	if (bytes_out > low_buffer_size) {
+		mv->lcount = low_buffer_size;
+		if (mv->hcount)
+			mv->hcount = bytes_out - low_buffer_size;
+	} else {
+		mv->lcount = bytes_out;
+		mv->hcount = 0;
+	}
+}
+
+int decompress_kernel(struct moveparams *mv, void *rmode)
+{
+	real_mode = rmode;
+
+	if (SCREEN_INFO.orig_video_mode == 7) {
+		vidmem = (char *) 0xb0000;
+		vidport = 0x3b4;
+	} else {
+		vidmem = (char *) 0xb8000;
+		vidport = 0x3d4;
+	}
+
+	lines = SCREEN_INFO.orig_video_lines;
+	cols = SCREEN_INFO.orig_video_cols;
+
+	if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
+	else setup_output_buffer_if_we_run_high(mv);
+
+	makecrc();
+	putstr(".\nDecompressing Linux...");
+	gunzip();
+	putstr("done.\nBooting the kernel.\n");
+	if (high_loaded) close_output_buffer_if_we_run_high(mv);
+	return high_loaded;
+}
diff --git a/arch/x86_64/boot/compressed/miscsetup.h b/arch/x86_64/boot/compressed/miscsetup.h
new file mode 100644
index 000000000000..bb1620531703
--- /dev/null
+++ b/arch/x86_64/boot/compressed/miscsetup.h
@@ -0,0 +1,39 @@
+#define NULL 0
+//typedef unsigned int size_t; 
+
+
+struct screen_info {
+	unsigned char  orig_x;			/* 0x00 */
+	unsigned char  orig_y;			/* 0x01 */
+	unsigned short dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
+	unsigned short orig_video_page;		/* 0x04 */
+	unsigned char  orig_video_mode;		/* 0x06 */
+	unsigned char  orig_video_cols;		/* 0x07 */
+	unsigned short unused2;			/* 0x08 */
+	unsigned short orig_video_ega_bx;	/* 0x0a */
+	unsigned short unused3;			/* 0x0c */
+	unsigned char  orig_video_lines;	/* 0x0e */
+	unsigned char  orig_video_isVGA;	/* 0x0f */
+	unsigned short orig_video_points;	/* 0x10 */
+
+	/* VESA graphic mode -- linear frame buffer */
+	unsigned short lfb_width;		/* 0x12 */
+	unsigned short lfb_height;		/* 0x14 */
+	unsigned short lfb_depth;		/* 0x16 */
+	unsigned long  lfb_base;		/* 0x18 */
+	unsigned long  lfb_size;		/* 0x1c */
+	unsigned short dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
+	unsigned short lfb_linelength;		/* 0x24 */
+	unsigned char  red_size;		/* 0x26 */
+	unsigned char  red_pos;			/* 0x27 */
+	unsigned char  green_size;		/* 0x28 */
+	unsigned char  green_pos;		/* 0x29 */
+	unsigned char  blue_size;		/* 0x2a */
+	unsigned char  blue_pos;		/* 0x2b */
+	unsigned char  rsvd_size;		/* 0x2c */
+	unsigned char  rsvd_pos;		/* 0x2d */
+	unsigned short vesapm_seg;		/* 0x2e */
+	unsigned short vesapm_off;		/* 0x30 */
+	unsigned short pages;			/* 0x32 */
+						/* 0x34 -- 0x3f reserved for future expansion */
+};
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr
new file mode 100644
index 000000000000..1ed9d791f863
--- /dev/null
+++ b/arch/x86_64/boot/compressed/vmlinux.scr
@@ -0,0 +1,9 @@
+SECTIONS
+{
+  .data : { 
+	input_len = .;
+	LONG(input_data_end - input_data) input_data = .; 
+	*(.data) 
+	input_data_end = .; 
+	}
+}
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh
new file mode 100644
index 000000000000..90f2452b3b9e
--- /dev/null
+++ b/arch/x86_64/boot/install.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+#
+# arch/i386/boot/install.sh
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 1995 by Linus Torvalds
+#
+# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
+#
+# "make install" script for i386 architecture
+#
+# Arguments:
+#   $1 - kernel version
+#   $2 - kernel image file
+#   $3 - kernel map file
+#   $4 - default install path (blank if root directory)
+#
+
+# User may have a custom install script
+
+if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi
+if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi
+
+# Default install - same as make zlilo
+
+if [ -f $4/vmlinuz ]; then
+	mv $4/vmlinuz $4/vmlinuz.old
+fi
+
+if [ -f $4/System.map ]; then
+	mv $4/System.map $4/System.old
+fi
+
+cat $2 > $4/vmlinuz
+cp $3 $4/System.map
+
+if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
diff --git a/arch/x86_64/boot/mtools.conf.in b/arch/x86_64/boot/mtools.conf.in
new file mode 100644
index 000000000000..efd6d2490c1d
--- /dev/null
+++ b/arch/x86_64/boot/mtools.conf.in
@@ -0,0 +1,17 @@
+#
+# mtools configuration file for "make (b)zdisk"
+#
+
+# Actual floppy drive
+drive a:
+  file="/dev/fd0"
+
+# 1.44 MB floppy disk image
+drive v:
+  file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter
+
+# 2.88 MB floppy disk image (mostly for virtual uses)
+drive w:
+  file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
+
+
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
new file mode 100644
index 000000000000..3e838be9dbe7
--- /dev/null
+++ b/arch/x86_64/boot/setup.S
@@ -0,0 +1,867 @@
+/*
+ *	setup.S		Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * setup.s is responsible for getting the system data from the BIOS,
+ * and putting them into the appropriate places in system memory.
+ * both setup.s and system has been loaded by the bootblock.
+ *
+ * This code asks the bios for memory/disk/other parameters, and
+ * puts them in a "safe" place: 0x90000-0x901FF, ie where the
+ * boot-block used to be. It is then up to the protected mode
+ * system to read them from there before the area is overwritten
+ * for buffer-blocks.
+ *
+ * Move PS/2 aux init code to psaux.c
+ * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92
+ *
+ * some changes and additional features by Christoph Niemann,
+ * March 1993/June 1994 (Christoph.Niemann@linux.org)
+ *
+ * add APM BIOS checking by Stephen Rothwell, May 1994
+ * (sfr@canb.auug.org.au)
+ *
+ * High load stuff, initrd support and position independency
+ * by Hans Lermen & Werner Almesberger, February 1996
+ * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch>
+ *
+ * Video handling moved to video.S by Martin Mares, March 1996
+ * <mj@k332.feld.cvut.cz>
+ *
+ * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david
+ * parsons) to avoid loadlin confusion, July 1997
+ *
+ * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
+ * <stiker@northlink.com>
+ *
+ * Fix to work around buggy BIOSes which dont use carry bit correctly
+ * and/or report extended memory in CX/DX for e801h memory size detection 
+ * call.  As a result the kernel got wrong figures.  The int15/e801h docs
+ * from Ralf Brown interrupt list seem to indicate AX/BX should be used
+ * anyway.  So to avoid breaking many machines (presumably there was a reason
+ * to orginally use CX/DX instead of AX/BX), we do a kludge to see
+ * if CX/DX have been changed in the e801 call and if so use AX/BX .
+ * Michael Miller, April 2001 <michaelm@mjmm.org>
+ *
+ * Added long mode checking and SSE force. March 2003, Andi Kleen.		
+ */
+
+#include <linux/config.h>
+#include <asm/segment.h>
+#include <linux/version.h>
+#include <linux/compile.h>
+#include <asm/boot.h>
+#include <asm/e820.h>
+#include <asm/page.h>
+
+/* Signature words to ensure LILO loaded us right */
+#define SIG1	0xAA55
+#define SIG2	0x5A5A
+
+INITSEG  = DEF_INITSEG		# 0x9000, we move boot here, out of the way
+SYSSEG   = DEF_SYSSEG		# 0x1000, system loaded at 0x10000 (65536).
+SETUPSEG = DEF_SETUPSEG		# 0x9020, this is the current segment
+				# ... and the former contents of CS
+
+DELTA_INITSEG = SETUPSEG - INITSEG	# 0x0020
+
+.code16
+.globl begtext, begdata, begbss, endtext, enddata, endbss
+
+.text
+begtext:
+.data
+begdata:
+.bss
+begbss:
+.text
+
+start:
+	jmp	trampoline
+
+# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
+
+		.ascii	"HdrS"		# header signature
+		.word	0x0203		# header version number (>= 0x0105)
+					# or else old loadlin-1.5 will fail)
+realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG
+start_sys_seg:	.word	SYSSEG
+		.word	kernel_version	# pointing to kernel version string
+					# above section of header is compatible
+					# with loadlin-1.5 (header v1.5). Don't
+					# change it.
+
+type_of_loader:	.byte	0		# = 0, old one (LILO, Loadlin,
+					#      Bootlin, SYSLX, bootsect...)
+					# See Documentation/i386/boot.txt for
+					# assigned ids
+	
+# flags, unused bits must be zero (RFU) bit within loadflags
+loadflags:
+LOADED_HIGH	= 1			# If set, the kernel is loaded high
+CAN_USE_HEAP	= 0x80			# If set, the loader also has set
+					# heap_end_ptr to tell how much
+					# space behind setup.S can be used for
+					# heap purposes.
+					# Only the loader knows what is free
+#ifndef __BIG_KERNEL__
+		.byte	0
+#else
+		.byte	LOADED_HIGH
+#endif
+
+setup_move_size: .word  0x8000		# size to move, when setup is not
+					# loaded at 0x90000. We will move setup 
+					# to 0x90000 then just before jumping
+					# into the kernel. However, only the
+					# loader knows how much data behind
+					# us also needs to be loaded.
+
+code32_start:				# here loaders can put a different
+					# start address for 32-bit code.
+#ifndef __BIG_KERNEL__
+		.long	0x1000		#   0x1000 = default for zImage
+#else
+		.long	0x100000	# 0x100000 = default for big kernel
+#endif
+
+ramdisk_image:	.long	0		# address of loaded ramdisk image
+					# Here the loader puts the 32-bit
+					# address where it loaded the image.
+					# This only will be read by the kernel.
+
+ramdisk_size:	.long	0		# its size in bytes
+
+bootsect_kludge:
+		.long	0		# obsolete
+
+heap_end_ptr:	.word	modelist+1024	# (Header version 0x0201 or later)
+					# space from here (exclusive) down to
+					# end of setup code can be used by setup
+					# for local heap purposes.
+
+pad1:		.word	0
+cmd_line_ptr:	.long 0			# (Header version 0x0202 or later)
+					# If nonzero, a 32-bit pointer
+					# to the kernel command line.
+					# The command line should be
+					# located between the start of
+					# setup and the end of low
+					# memory (0xa0000), or it may
+					# get overwritten before it
+					# gets read.  If this field is
+					# used, there is no longer
+					# anything magical about the
+					# 0x90000 segment; the setup
+					# can be located anywhere in
+					# low memory 0x10000 or higher.
+
+ramdisk_max:	.long 0xffffffff
+	
+trampoline:	call	start_of_setup
+		.align 16
+					# The offset at this point is 0x240
+		.space  (0x7ff-0x240+1)	# E820 & EDD space (ending at 0x7ff)
+# End of setup header #####################################################
+
+start_of_setup:
+# Bootlin depends on this being done early
+	movw	$0x01500, %ax
+	movb	$0x81, %dl
+	int	$0x13
+
+#ifdef SAFE_RESET_DISK_CONTROLLER
+# Reset the disk controller.
+	movw	$0x0000, %ax
+	movb	$0x80, %dl
+	int	$0x13
+#endif
+
+# Set %ds = %cs, we know that SETUPSEG = %cs at this point
+	movw	%cs, %ax		# aka SETUPSEG
+	movw	%ax, %ds
+# Check signature at end of setup
+	cmpw	$SIG1, setup_sig1
+	jne	bad_sig
+
+	cmpw	$SIG2, setup_sig2
+	jne	bad_sig
+
+	jmp	good_sig1
+
+# Routine to print asciiz string at ds:si
+prtstr:
+	lodsb
+	andb	%al, %al
+	jz	fin
+
+	call	prtchr
+	jmp	prtstr
+
+fin:	ret
+
+# Space printing
+prtsp2:	call	prtspc		# Print double space
+prtspc:	movb	$0x20, %al	# Print single space (note: fall-thru)
+
+prtchr:	
+	pushw	%ax
+	pushw	%cx
+	movw	$0007,%bx
+	movw	$0x01, %cx
+	movb	$0x0e, %ah
+	int	$0x10
+	popw	%cx
+	popw	%ax
+	ret
+
+beep:	movb	$0x07, %al
+	jmp	prtchr
+	
+no_sig_mess: .string	"No setup signature found ..."
+
+good_sig1:
+	jmp	good_sig
+
+# We now have to find the rest of the setup code/data
+bad_sig:
+	movw	%cs, %ax			# SETUPSEG
+	subw	$DELTA_INITSEG, %ax		# INITSEG
+	movw	%ax, %ds
+	xorb	%bh, %bh
+	movb	(497), %bl			# get setup sect from bootsect
+	subw	$4, %bx				# LILO loads 4 sectors of setup
+	shlw	$8, %bx				# convert to words (1sect=2^8 words)
+	movw	%bx, %cx
+	shrw	$3, %bx				# convert to segment
+	addw	$SYSSEG, %bx
+	movw	%bx, %cs:start_sys_seg
+# Move rest of setup code/data to here
+	movw	$2048, %di			# four sectors loaded by LILO
+	subw	%si, %si
+	movw	%cs, %ax			# aka SETUPSEG
+	movw	%ax, %es
+	movw	$SYSSEG, %ax
+	movw	%ax, %ds
+	rep
+	movsw
+	movw	%cs, %ax			# aka SETUPSEG
+	movw	%ax, %ds
+	cmpw	$SIG1, setup_sig1
+	jne	no_sig
+
+	cmpw	$SIG2, setup_sig2
+	jne	no_sig
+
+	jmp	good_sig
+
+no_sig:
+	lea	no_sig_mess, %si
+	call	prtstr
+
+no_sig_loop:
+	jmp	no_sig_loop
+
+good_sig:
+	movw	%cs, %ax			# aka SETUPSEG
+	subw	$DELTA_INITSEG, %ax 		# aka INITSEG
+	movw	%ax, %ds
+# Check if an old loader tries to load a big-kernel
+	testb	$LOADED_HIGH, %cs:loadflags	# Do we have a big kernel?
+	jz	loader_ok			# No, no danger for old loaders.
+
+	cmpb	$0, %cs:type_of_loader 		# Do we have a loader that
+						# can deal with us?
+	jnz	loader_ok			# Yes, continue.
+
+	pushw	%cs				# No, we have an old loader,
+	popw	%ds				# die. 
+	lea	loader_panic_mess, %si
+	call	prtstr
+
+	jmp	no_sig_loop
+
+loader_panic_mess: .string "Wrong loader, giving up..."
+
+loader_ok:
+	/* check for long mode. */
+	/* we have to do this before the VESA setup, otherwise the user
+	   can't see the error message. */
+	
+	pushw	%ds
+	movw	%cs,%ax
+	movw	%ax,%ds
+	
+	/* minimum CPUID flags for x86-64 */
+	/* see http://www.x86-64.org/lists/discuss/msg02971.html */		
+#define SSE_MASK ((1<<25)|(1<<26))
+#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\
+					   (1<<13)|(1<<15)|(1<<24))
+#define REQUIRED_MASK2 (1<<29)
+
+	pushfl				/* standard way to check for cpuid */
+	popl	%eax
+	movl	%eax,%ebx
+	xorl	$0x200000,%eax
+	pushl	%eax
+	popfl
+	pushfl
+	popl	%eax
+	cmpl	%eax,%ebx
+	jz	no_longmode		/* cpu has no cpuid */
+	movl	$0x0,%eax
+	cpuid
+	cmpl	$0x1,%eax
+	jb	no_longmode		/* no cpuid 1 */
+	xor	%di,%di
+	cmpl	$0x68747541,%ebx	/* AuthenticAMD */
+	jnz	noamd
+	cmpl	$0x69746e65,%edx
+	jnz	noamd
+	cmpl	$0x444d4163,%ecx
+	jnz	noamd
+	mov	$1,%di			/* cpu is from AMD */
+noamd:		
+	movl    $0x1,%eax
+	cpuid
+	andl	$REQUIRED_MASK1,%edx
+	xorl	$REQUIRED_MASK1,%edx
+	jnz	no_longmode
+	movl    $0x80000000,%eax
+	cpuid
+	cmpl    $0x80000001,%eax
+	jb      no_longmode             /* no extended cpuid */
+	movl    $0x80000001,%eax
+	cpuid
+	andl    $REQUIRED_MASK2,%edx
+	xorl    $REQUIRED_MASK2,%edx
+	jnz     no_longmode
+sse_test:		
+	movl	$1,%eax
+	cpuid
+	andl	$SSE_MASK,%edx
+	cmpl	$SSE_MASK,%edx
+	je	sse_ok
+	test	%di,%di
+	jz	no_longmode	/* only try to force SSE on AMD */ 
+	movl	$0xc0010015,%ecx	/* HWCR */
+	rdmsr
+	btr	$15,%eax	/* enable SSE */
+	wrmsr
+	xor	%di,%di		/* don't loop */
+	jmp	sse_test	/* try again */	
+no_longmode:
+	call	beep
+	lea	long_mode_panic,%si
+	call	prtstr
+no_longmode_loop:		
+	jmp	no_longmode_loop
+long_mode_panic:
+	.string "Your CPU does not support long mode. Use a 32bit distribution."
+	.byte 0
+	
+sse_ok:
+	popw	%ds
+	
+# tell BIOS we want to go to long mode
+	movl  $0xec00,%eax	# declare target operating mode
+	movl  $2,%ebx		# long mode
+	int $0x15			
+	
+# Get memory size (extended mem, kB)
+
+	xorl	%eax, %eax
+	movl	%eax, (0x1e0)
+#ifndef STANDARD_MEMORY_BIOS_CALL
+	movb	%al, (E820NR)
+# Try three different memory detection schemes.  First, try
+# e820h, which lets us assemble a memory map, then try e801h,
+# which returns a 32-bit memory size, and finally 88h, which
+# returns 0-64m
+
+# method E820H:
+# the memory map from hell.  e820h returns memory classified into
+# a whole bunch of different types, and allows memory holes and
+# everything.  We scan through this memory map and build a list
+# of the first 32 memory areas, which we return at [E820MAP].
+# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm
+
+#define SMAP  0x534d4150
+
+meme820:
+	xorl	%ebx, %ebx			# continuation counter
+	movw	$E820MAP, %di			# point into the whitelist
+						# so we can have the bios
+						# directly write into it.
+
+jmpe820:
+	movl	$0x0000e820, %eax		# e820, upper word zeroed
+	movl	$SMAP, %edx			# ascii 'SMAP'
+	movl	$20, %ecx			# size of the e820rec
+	pushw	%ds				# data record.
+	popw	%es
+	int	$0x15				# make the call
+	jc	bail820				# fall to e801 if it fails
+
+	cmpl	$SMAP, %eax			# check the return is `SMAP'
+	jne	bail820				# fall to e801 if it fails
+
+#	cmpl	$1, 16(%di)			# is this usable memory?
+#	jne	again820
+
+	# If this is usable memory, we save it by simply advancing %di by
+	# sizeof(e820rec).
+	#
+good820:
+	movb	(E820NR), %al			# up to 32 entries
+	cmpb	$E820MAX, %al
+	jnl	bail820
+
+	incb	(E820NR)
+	movw	%di, %ax
+	addw	$20, %ax
+	movw	%ax, %di
+again820:
+	cmpl	$0, %ebx			# check to see if
+	jne	jmpe820				# %ebx is set to EOF
+bail820:
+
+
+# method E801H:
+# memory size is in 1k chunksizes, to avoid confusing loadlin.
+# we store the 0xe801 memory size in a completely different place,
+# because it will most likely be longer than 16 bits.
+# (use 1e0 because that's what Larry Augustine uses in his
+# alternative new memory detection scheme, and it's sensible
+# to write everything into the same place.)
+
+meme801:
+	stc					# fix to work around buggy
+	xorw	%cx,%cx				# BIOSes which dont clear/set
+	xorw	%dx,%dx				# carry on pass/error of
+						# e801h memory size call
+						# or merely pass cx,dx though
+						# without changing them.
+	movw	$0xe801, %ax
+	int	$0x15
+	jc	mem88
+
+	cmpw	$0x0, %cx			# Kludge to handle BIOSes
+	jne	e801usecxdx			# which report their extended
+	cmpw	$0x0, %dx			# memory in AX/BX rather than
+	jne	e801usecxdx			# CX/DX.  The spec I have read
+	movw	%ax, %cx			# seems to indicate AX/BX 
+	movw	%bx, %dx			# are more reasonable anyway...
+
+e801usecxdx:
+	andl	$0xffff, %edx			# clear sign extend
+	shll	$6, %edx			# and go from 64k to 1k chunks
+	movl	%edx, (0x1e0)			# store extended memory size
+	andl	$0xffff, %ecx			# clear sign extend
+ 	addl	%ecx, (0x1e0)			# and add lower memory into
+						# total size.
+
+# Ye Olde Traditional Methode.  Returns the memory size (up to 16mb or
+# 64mb, depending on the bios) in ax.
+mem88:
+
+#endif
+	movb	$0x88, %ah
+	int	$0x15
+	movw	%ax, (2)
+
+# Set the keyboard repeat rate to the max
+	movw	$0x0305, %ax
+	xorw	%bx, %bx
+	int	$0x16
+
+# Check for video adapter and its parameters and allow the
+# user to browse video modes.
+	call	video				# NOTE: we need %ds pointing
+						# to bootsector
+
+# Get hd0 data...
+	xorw	%ax, %ax
+	movw	%ax, %ds
+	ldsw	(4 * 0x41), %si
+	movw	%cs, %ax			# aka SETUPSEG
+	subw	$DELTA_INITSEG, %ax		# aka INITSEG
+	pushw	%ax
+	movw	%ax, %es
+	movw	$0x0080, %di
+	movw	$0x10, %cx
+	pushw	%cx
+	cld
+	rep
+ 	movsb
+# Get hd1 data...
+	xorw	%ax, %ax
+	movw	%ax, %ds
+	ldsw	(4 * 0x46), %si
+	popw	%cx
+	popw	%es
+	movw	$0x0090, %di
+	rep
+	movsb
+# Check that there IS a hd1 :-)
+	movw	$0x01500, %ax
+	movb	$0x81, %dl
+	int	$0x13
+	jc	no_disk1
+	
+	cmpb	$3, %ah
+	je	is_disk1
+
+no_disk1:
+	movw	%cs, %ax			# aka SETUPSEG
+	subw	$DELTA_INITSEG, %ax 		# aka INITSEG
+	movw	%ax, %es
+	movw	$0x0090, %di
+	movw	$0x10, %cx
+	xorw	%ax, %ax
+	cld
+	rep
+	stosb
+is_disk1:
+
+# Check for PS/2 pointing device
+	movw	%cs, %ax			# aka SETUPSEG
+	subw	$DELTA_INITSEG, %ax		# aka INITSEG
+	movw	%ax, %ds
+	movw	$0, (0x1ff)			# default is no pointing device
+	int	$0x11				# int 0x11: equipment list
+	testb	$0x04, %al			# check if mouse installed
+	jz	no_psmouse
+
+	movw	$0xAA, (0x1ff)			# device present
+no_psmouse:
+
+#include "../../i386/boot/edd.S"
+
+# Now we want to move to protected mode ...
+	cmpw	$0, %cs:realmode_swtch
+	jz	rmodeswtch_normal
+
+	lcall	*%cs:realmode_swtch
+
+	jmp	rmodeswtch_end
+
+rmodeswtch_normal:
+        pushw	%cs
+	call	default_switch
+
+rmodeswtch_end:
+# we get the code32 start address and modify the below 'jmpi'
+# (loader may have changed it)
+	movl	%cs:code32_start, %eax
+	movl	%eax, %cs:code32
+
+# Now we move the system to its rightful place ... but we check if we have a
+# big-kernel. In that case we *must* not move it ...
+	testb	$LOADED_HIGH, %cs:loadflags
+	jz	do_move0			# .. then we have a normal low
+						# loaded zImage
+						# .. or else we have a high
+						# loaded bzImage
+	jmp	end_move			# ... and we skip moving
+
+do_move0:
+	movw	$0x100, %ax			# start of destination segment
+	movw	%cs, %bp			# aka SETUPSEG
+	subw	$DELTA_INITSEG, %bp		# aka INITSEG
+	movw	%cs:start_sys_seg, %bx		# start of source segment
+	cld
+do_move:
+	movw	%ax, %es			# destination segment
+	incb	%ah				# instead of add ax,#0x100
+	movw	%bx, %ds			# source segment
+	addw	$0x100, %bx
+	subw	%di, %di
+	subw	%si, %si
+	movw 	$0x800, %cx
+	rep
+	movsw
+	cmpw	%bp, %bx			# assume start_sys_seg > 0x200,
+						# so we will perhaps read one
+						# page more than needed, but
+						# never overwrite INITSEG
+						# because destination is a
+						# minimum one page below source
+	jb	do_move
+
+end_move:
+# then we load the segment descriptors
+	movw	%cs, %ax			# aka SETUPSEG
+	movw	%ax, %ds
+		
+# Check whether we need to be downward compatible with version <=201
+	cmpl	$0, cmd_line_ptr
+	jne	end_move_self		# loader uses version >=202 features
+	cmpb	$0x20, type_of_loader
+	je	end_move_self		# bootsect loader, we know of it
+
+# Boot loader doesnt support boot protocol version 2.02.
+# If we have our code not at 0x90000, we need to move it there now.
+# We also then need to move the params behind it (commandline)
+# Because we would overwrite the code on the current IP, we move
+# it in two steps, jumping high after the first one.
+	movw	%cs, %ax
+	cmpw	$SETUPSEG, %ax
+	je	end_move_self
+
+	cli					# make sure we really have
+						# interrupts disabled !
+						# because after this the stack
+						# should not be used
+	subw	$DELTA_INITSEG, %ax		# aka INITSEG
+	movw	%ss, %dx
+	cmpw	%ax, %dx
+	jb	move_self_1
+
+	addw	$INITSEG, %dx
+	subw	%ax, %dx			# this will go into %ss after
+						# the move
+move_self_1:
+	movw	%ax, %ds
+	movw	$INITSEG, %ax			# real INITSEG
+	movw	%ax, %es
+	movw	%cs:setup_move_size, %cx
+	std					# we have to move up, so we use
+						# direction down because the
+						# areas may overlap
+	movw	%cx, %di
+	decw	%di
+	movw	%di, %si
+	subw	$move_self_here+0x200, %cx
+	rep
+	movsb
+	ljmp	$SETUPSEG, $move_self_here
+
+move_self_here:
+	movw	$move_self_here+0x200, %cx
+	rep
+	movsb
+	movw	$SETUPSEG, %ax
+	movw	%ax, %ds
+	movw	%dx, %ss
+end_move_self:					# now we are at the right place
+	lidt	idt_48				# load idt with 0,0
+	xorl	%eax, %eax			# Compute gdt_base
+	movw	%ds, %ax			# (Convert %ds:gdt to a linear ptr)
+	shll	$4, %eax
+	addl	$gdt, %eax
+	movl	%eax, (gdt_48+2)
+	lgdt	gdt_48				# load gdt with whatever is
+						# appropriate
+
+# that was painless, now we enable a20
+	call	empty_8042
+
+	movb	$0xD1, %al			# command write
+	outb	%al, $0x64
+	call	empty_8042
+
+	movb	$0xDF, %al			# A20 on
+	outb	%al, $0x60
+	call	empty_8042
+
+#
+#	You must preserve the other bits here. Otherwise embarrasing things
+#	like laptops powering off on boot happen. Corrected version by Kira
+#	Brown from Linux 2.2
+#
+	inb	$0x92, %al			# 
+	orb	$02, %al			# "fast A20" version
+	outb	%al, $0x92			# some chips have only this
+
+# wait until a20 really *is* enabled; it can take a fair amount of
+# time on certain systems; Toshiba Tecras are known to have this
+# problem.  The memory location used here (0x200) is the int 0x80
+# vector, which should be safe to use.
+
+	xorw	%ax, %ax			# segment 0x0000
+	movw	%ax, %fs
+	decw	%ax				# segment 0xffff (HMA)
+	movw	%ax, %gs
+a20_wait:
+	incw	%ax				# unused memory location <0xfff0
+	movw	%ax, %fs:(0x200)		# we use the "int 0x80" vector
+	cmpw	%gs:(0x210), %ax		# and its corresponding HMA addr
+	je	a20_wait			# loop until no longer aliased
+
+# make sure any possible coprocessor is properly reset..
+	xorw	%ax, %ax
+	outb	%al, $0xf0
+	call	delay
+
+	outb	%al, $0xf1
+	call	delay
+
+# well, that went ok, I hope. Now we mask all interrupts - the rest
+# is done in init_IRQ().
+	movb	$0xFF, %al			# mask all interrupts for now
+	outb	%al, $0xA1
+	call	delay
+	
+	movb	$0xFB, %al			# mask all irq's but irq2 which
+	outb	%al, $0x21			# is cascaded
+
+# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't
+# need no steenking BIOS anyway (except for the initial loading :-).
+# The BIOS-routine wants lots of unnecessary data, and it's less
+# "interesting" anyway. This is how REAL programmers do it.
+#
+# Well, now's the time to actually move into protected mode. To make
+# things as simple as possible, we do no register set-up or anything,
+# we let the gnu-compiled 32-bit programs do that. We just jump to
+# absolute address 0x1000 (or the loader supplied one),
+# in 32-bit protected mode.
+#
+# Note that the short jump isn't strictly needed, although there are
+# reasons why it might be a good idea. It won't hurt in any case.
+	movw	$1, %ax				# protected mode (PE) bit
+	lmsw	%ax				# This is it!
+	jmp	flush_instr
+
+flush_instr:
+	xorw	%bx, %bx			# Flag to indicate a boot
+	xorl	%esi, %esi			# Pointer to real-mode code
+	movw	%cs, %si
+	subw	$DELTA_INITSEG, %si
+	shll	$4, %esi			# Convert to 32-bit pointer
+# NOTE: For high loaded big kernels we need a
+#	jmpi    0x100000,__KERNEL_CS
+#
+#	but we yet haven't reloaded the CS register, so the default size 
+#	of the target offset still is 16 bit.
+#       However, using an operant prefix (0x66), the CPU will properly
+#	take our 48 bit far pointer. (INTeL 80386 Programmer's Reference
+#	Manual, Mixing 16-bit and 32-bit code, page 16-6)
+
+	.byte 0x66, 0xea			# prefix + jmpi-opcode
+code32:	.long	0x1000				# will be set to 0x100000
+						# for big kernels
+	.word	__KERNEL_CS
+
+# Here's a bunch of information about your current kernel..
+kernel_version:	.ascii	UTS_RELEASE
+		.ascii	" ("
+		.ascii	LINUX_COMPILE_BY
+		.ascii	"@"
+		.ascii	LINUX_COMPILE_HOST
+		.ascii	") "
+		.ascii	UTS_VERSION
+		.byte	0
+
+# This is the default real mode switch routine.
+# to be called just before protected mode transition
+default_switch:
+	cli					# no interrupts allowed !
+	movb	$0x80, %al			# disable NMI for bootup
+						# sequence
+	outb	%al, $0x70
+	lret
+
+
+# This routine checks that the keyboard command queue is empty
+# (after emptying the output buffers)
+#
+# Some machines have delusions that the keyboard buffer is always full
+# with no keyboard attached...
+#
+# If there is no keyboard controller, we will usually get 0xff
+# to all the reads.  With each IO taking a microsecond and
+# a timeout of 100,000 iterations, this can take about half a
+# second ("delay" == outb to port 0x80). That should be ok,
+# and should also be plenty of time for a real keyboard controller
+# to empty.
+#
+
+empty_8042:
+	pushl	%ecx
+	movl	$100000, %ecx
+
+empty_8042_loop:
+	decl	%ecx
+	jz	empty_8042_end_loop
+
+	call	delay
+
+	inb	$0x64, %al			# 8042 status port
+	testb	$1, %al				# output buffer?
+	jz	no_output
+
+	call	delay
+	inb	$0x60, %al			# read it
+	jmp	empty_8042_loop
+
+no_output:
+	testb	$2, %al				# is input buffer full?
+	jnz	empty_8042_loop			# yes - loop
+empty_8042_end_loop:
+	popl	%ecx
+	ret
+
+# Read the cmos clock. Return the seconds in al
+gettime:
+	pushw	%cx
+	movb	$0x02, %ah
+	int	$0x1a
+	movb	%dh, %al			# %dh contains the seconds
+	andb	$0x0f, %al
+	movb	%dh, %ah
+	movb	$0x04, %cl
+	shrb	%cl, %ah
+	aad
+	popw	%cx
+	ret
+
+# Delay is needed after doing I/O
+delay:
+	outb	%al,$0x80
+	ret
+
+# Descriptor tables
+gdt:
+	.word	0, 0, 0, 0			# dummy
+
+	.word	0, 0, 0, 0			# unused
+
+	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
+	.word	0				# base address = 0
+	.word	0x9A00				# code read/exec
+	.word	0x00CF				# granularity = 4096, 386
+						#  (+5th nibble of limit)
+
+	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
+	.word	0				# base address = 0
+	.word	0x9200				# data read/write
+	.word	0x00CF				# granularity = 4096, 386
+						#  (+5th nibble of limit)
+idt_48:
+	.word	0				# idt limit = 0
+	.word	0, 0				# idt base = 0L
+gdt_48:
+	.word	0x8000				# gdt limit=2048,
+						#  256 GDT entries
+
+	.word	0, 0				# gdt base (filled in later)
+
+# Include video setup & detection code
+
+#include "video.S"
+
+# Setup signature -- must be last
+setup_sig1:	.word	SIG1
+setup_sig2:	.word	SIG2
+
+# After this point, there is some free space which is used by the video mode
+# handling code to store the temporary mode table (not used by the kernel).
+
+modelist:
+
+.text
+endtext:
+.data
+enddata:
+.bss
+endbss:
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c
new file mode 100644
index 000000000000..c2fa66313170
--- /dev/null
+++ b/arch/x86_64/boot/tools/build.c
@@ -0,0 +1,186 @@
+/*
+ *  $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 1997 Martin Mares
+ */
+
+/*
+ * This file builds a disk-image from three different files:
+ *
+ * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest
+ * - setup: 8086 machine code, sets up system parm
+ * - system: 80386 code for actual system
+ *
+ * It does some checking that all files are of the correct type, and
+ * just writes the result to stdout, removing headers and padding to
+ * the right amount. It also writes some system data to stderr.
+ */
+
+/*
+ * Changes by tytso to allow root device specification
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ * Cross compiling fixes by Gertjan van Wingerde, July 1996
+ * Rewritten by Martin Mares, April 1997
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <asm/boot.h>
+
+typedef unsigned char byte;
+typedef unsigned short word;
+typedef unsigned long u32;
+
+#define DEFAULT_MAJOR_ROOT 0
+#define DEFAULT_MINOR_ROOT 0
+
+/* Minimal number of setup sectors (see also bootsect.S) */
+#define SETUP_SECTS 4
+
+byte buf[1024];
+int fd;
+int is_big_kernel;
+
+void die(const char * str, ...)
+{
+	va_list args;
+	va_start(args, str);
+	vfprintf(stderr, str, args);
+	fputc('\n', stderr);
+	exit(1);
+}
+
+void file_open(const char *name)
+{
+	if ((fd = open(name, O_RDONLY, 0)) < 0)
+		die("Unable to open `%s': %m", name);
+}
+
+void usage(void)
+{
+	die("Usage: build [-b] bootsect setup system [rootdev] [> image]");
+}
+
+int main(int argc, char ** argv)
+{
+	unsigned int i, c, sz, setup_sectors;
+	u32 sys_size;
+	byte major_root, minor_root;
+	struct stat sb;
+
+	if (argc > 2 && !strcmp(argv[1], "-b"))
+	  {
+	    is_big_kernel = 1;
+	    argc--, argv++;
+	  }
+	if ((argc < 4) || (argc > 5))
+		usage();
+	if (argc > 4) {
+		if (!strcmp(argv[4], "CURRENT")) {
+			if (stat("/", &sb)) {
+				perror("/");
+				die("Couldn't stat /");
+			}
+			major_root = major(sb.st_dev);
+			minor_root = minor(sb.st_dev);
+		} else if (strcmp(argv[4], "FLOPPY")) {
+			if (stat(argv[4], &sb)) {
+				perror(argv[4]);
+				die("Couldn't stat root device.");
+			}
+			major_root = major(sb.st_rdev);
+			minor_root = minor(sb.st_rdev);
+		} else {
+			major_root = 0;
+			minor_root = 0;
+		}
+	} else {
+		major_root = DEFAULT_MAJOR_ROOT;
+		minor_root = DEFAULT_MINOR_ROOT;
+	}
+	fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
+
+	file_open(argv[1]);
+	i = read(fd, buf, sizeof(buf));
+	fprintf(stderr,"Boot sector %d bytes.\n",i);
+	if (i != 512)
+		die("Boot block must be exactly 512 bytes");
+	if (buf[510] != 0x55 || buf[511] != 0xaa)
+		die("Boot block hasn't got boot flag (0xAA55)");
+	buf[508] = minor_root;
+	buf[509] = major_root;
+	if (write(1, buf, 512) != 512)
+		die("Write call failed");
+	close (fd);
+
+	file_open(argv[2]);				    /* Copy the setup code */
+	for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c )
+		if (write(1, buf, c) != c)
+			die("Write call failed");
+	if (c != 0)
+		die("read-error on `setup'");
+	close (fd);
+
+	setup_sectors = (i + 511) / 512;	/* Pad unused space with zeros */
+	/* for compatibility with ancient versions of LILO. */
+	if (setup_sectors < SETUP_SECTS)
+		setup_sectors = SETUP_SECTS;
+	fprintf(stderr, "Setup is %d bytes.\n", i);
+	memset(buf, 0, sizeof(buf));
+	while (i < setup_sectors * 512) {
+		c = setup_sectors * 512 - i;
+		if (c > sizeof(buf))
+			c = sizeof(buf);
+		if (write(1, buf, c) != c)
+			die("Write call failed");
+		i += c;
+	}
+
+	file_open(argv[3]);
+	if (fstat (fd, &sb))
+		die("Unable to stat `%s': %m", argv[3]);
+	sz = sb.st_size;
+	fprintf (stderr, "System is %d kB\n", sz/1024);
+	sys_size = (sz + 15) / 16;
+	/* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */
+	if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE))
+		die("System is too big. Try using %smodules.",
+			is_big_kernel ? "" : "bzImage or ");
+	while (sz > 0) {
+		int l, n;
+
+		l = (sz > sizeof(buf)) ? sizeof(buf) : sz;
+		if ((n=read(fd, buf, l)) != l) {
+			if (n < 0)
+				die("Error reading %s: %m", argv[3]);
+			else
+				die("%s: Unexpected EOF", argv[3]);
+		}
+		if (write(1, buf, l) != l)
+			die("Write failed");
+		sz -= l;
+	}
+	close(fd);
+
+	if (lseek(1, 497, SEEK_SET) != 497)		    /* Write sizes to the bootsector */
+		die("Output: seek failed");
+	buf[0] = setup_sectors;
+	if (write(1, buf, 1) != 1)
+		die("Write of setup sector count failed");
+	if (lseek(1, 500, SEEK_SET) != 500)
+		die("Output: seek failed");
+	buf[0] = (sys_size & 0xff);
+	buf[1] = ((sys_size >> 8) & 0xff);
+	if (write(1, buf, 2) != 2)
+		die("Write of image length failed");
+
+	return 0;					    /* Everything is OK */
+}
diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S
new file mode 100644
index 000000000000..0587477c99f2
--- /dev/null
+++ b/arch/x86_64/boot/video.S
@@ -0,0 +1,2007 @@
+/*	video.S
+ *
+ *	Display adapter & video mode setup, version 2.13 (14-May-99)
+ *
+ *	Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz>
+ *	Based on the original setup.S code (C) Linus Torvalds and Mats Anderson
+ *
+ *	Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999
+ *
+ *	For further information, look at Documentation/svga.txt.
+ *
+ */
+
+#include <linux/config.h> /* for CONFIG_VIDEO_* */
+
+/* Enable autodetection of SVGA adapters and modes. */
+#undef CONFIG_VIDEO_SVGA
+
+/* Enable autodetection of VESA modes */
+#define CONFIG_VIDEO_VESA
+
+/* Enable compacting of mode table */
+#define CONFIG_VIDEO_COMPACT
+
+/* Retain screen contents when switching modes */
+#define CONFIG_VIDEO_RETAIN
+
+/* Enable local mode list */
+#undef CONFIG_VIDEO_LOCAL
+
+/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */
+#undef CONFIG_VIDEO_400_HACK
+
+/* Hack that lets you force specific BIOS mode ID and specific dimensions */
+#undef CONFIG_VIDEO_GFX_HACK
+#define VIDEO_GFX_BIOS_AX 0x4f02	/* 800x600 on ThinkPad */
+#define VIDEO_GFX_BIOS_BX 0x0102
+#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425	/* 100x37 */
+
+/* This code uses an extended set of video mode numbers. These include:
+ * Aliases for standard modes
+ *	NORMAL_VGA (-1)
+ *	EXTENDED_VGA (-2)
+ *	ASK_VGA (-3)
+ * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
+ * of compatibility when extending the table. These are between 0x00 and 0xff.
+ */
+#define VIDEO_FIRST_MENU 0x0000
+
+/* Standard BIOS video modes (BIOS number + 0x0100) */
+#define VIDEO_FIRST_BIOS 0x0100
+
+/* VESA BIOS video modes (VESA number + 0x0200) */
+#define VIDEO_FIRST_VESA 0x0200
+
+/* Video7 special modes (BIOS number + 0x0900) */
+#define VIDEO_FIRST_V7 0x0900
+
+/* Special video modes */
+#define VIDEO_FIRST_SPECIAL 0x0f00
+#define VIDEO_80x25 0x0f00
+#define VIDEO_8POINT 0x0f01
+#define VIDEO_80x43 0x0f02
+#define VIDEO_80x28 0x0f03
+#define VIDEO_CURRENT_MODE 0x0f04
+#define VIDEO_80x30 0x0f05
+#define VIDEO_80x34 0x0f06
+#define VIDEO_80x60 0x0f07
+#define VIDEO_GFX_HACK 0x0f08
+#define VIDEO_LAST_SPECIAL 0x0f09
+
+/* Video modes given by resolution */
+#define VIDEO_FIRST_RESOLUTION 0x1000
+
+/* The "recalculate timings" flag */
+#define VIDEO_RECALC 0x8000
+
+/* Positions of various video parameters passed to the kernel */
+/* (see also include/linux/tty.h) */
+#define PARAM_CURSOR_POS	0x00
+#define PARAM_VIDEO_PAGE	0x04
+#define PARAM_VIDEO_MODE	0x06
+#define PARAM_VIDEO_COLS	0x07
+#define PARAM_VIDEO_EGA_BX	0x0a
+#define PARAM_VIDEO_LINES	0x0e
+#define PARAM_HAVE_VGA		0x0f
+#define PARAM_FONT_POINTS	0x10
+
+#define PARAM_LFB_WIDTH		0x12
+#define PARAM_LFB_HEIGHT	0x14
+#define PARAM_LFB_DEPTH		0x16
+#define PARAM_LFB_BASE		0x18
+#define PARAM_LFB_SIZE		0x1c
+#define PARAM_LFB_LINELENGTH	0x24
+#define PARAM_LFB_COLORS	0x26
+#define PARAM_VESAPM_SEG	0x2e
+#define PARAM_VESAPM_OFF	0x30
+#define PARAM_LFB_PAGES		0x32
+#define PARAM_VESA_ATTRIB	0x34
+
+/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
+#ifdef CONFIG_VIDEO_RETAIN
+#define DO_STORE call store_screen
+#else
+#define DO_STORE
+#endif /* CONFIG_VIDEO_RETAIN */
+
+# This is the main entry point called by setup.S
+# %ds *must* be pointing to the bootsector
+video:	pushw	%ds		# We use different segments
+	pushw	%ds		# FS contains original DS
+	popw	%fs
+	pushw	%cs		# DS is equal to CS
+	popw	%ds
+	pushw	%cs		# ES is equal to CS
+	popw	%es
+	xorw	%ax, %ax
+	movw	%ax, %gs	# GS is zero
+	cld
+	call	basic_detect	# Basic adapter type testing (EGA/VGA/MDA/CGA)
+#ifdef CONFIG_VIDEO_SELECT
+	movw	%fs:(0x01fa), %ax		# User selected video mode
+	cmpw	$ASK_VGA, %ax			# Bring up the menu
+	jz	vid2
+
+	call	mode_set			# Set the mode
+	jc	vid1
+
+	leaw	badmdt, %si			# Invalid mode ID
+	call	prtstr
+vid2:	call	mode_menu
+vid1:
+#ifdef CONFIG_VIDEO_RETAIN
+	call	restore_screen			# Restore screen contents
+#endif /* CONFIG_VIDEO_RETAIN */
+	call	store_edid
+#endif /* CONFIG_VIDEO_SELECT */
+	call	mode_params			# Store mode parameters
+	popw	%ds				# Restore original DS
+	ret
+
+# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel.
+basic_detect:
+	movb	$0, %fs:(PARAM_HAVE_VGA)
+	movb	$0x12, %ah	# Check EGA/VGA
+	movb	$0x10, %bl
+	int	$0x10
+	movw	%bx, %fs:(PARAM_VIDEO_EGA_BX)	# Identifies EGA to the kernel
+	cmpb	$0x10, %bl			# No, it's a CGA/MDA/HGA card.
+	je	basret
+
+	incb	adapter
+	movw	$0x1a00, %ax			# Check EGA or VGA?
+	int	$0x10
+	cmpb	$0x1a, %al			# 1a means VGA...
+	jne	basret				# anything else is EGA.
+	
+	incb	%fs:(PARAM_HAVE_VGA)		# We've detected a VGA
+	incb	adapter
+basret:	ret
+
+# Store the video mode parameters for later usage by the kernel.
+# This is done by asking the BIOS except for the rows/columns
+# parameters in the default 80x25 mode -- these are set directly,
+# because some very obscure BIOSes supply insane values.
+mode_params:
+#ifdef CONFIG_VIDEO_SELECT
+	cmpb	$0, graphic_mode
+	jnz	mopar_gr
+#endif
+	movb	$0x03, %ah			# Read cursor position
+	xorb	%bh, %bh
+	int	$0x10
+	movw	%dx, %fs:(PARAM_CURSOR_POS)
+	movb	$0x0f, %ah			# Read page/mode/width
+	int	$0x10
+	movw	%bx, %fs:(PARAM_VIDEO_PAGE)
+	movw	%ax, %fs:(PARAM_VIDEO_MODE)	# Video mode and screen width
+	cmpb	$0x7, %al			# MDA/HGA => segment differs
+	jnz	mopar0
+
+	movw	$0xb000, video_segment
+mopar0: movw	%gs:(0x485), %ax		# Font size
+	movw	%ax, %fs:(PARAM_FONT_POINTS)	# (valid only on EGA/VGA)
+	movw	force_size, %ax			# Forced size?
+	orw	%ax, %ax
+	jz	mopar1
+
+	movb	%ah, %fs:(PARAM_VIDEO_COLS)
+	movb	%al, %fs:(PARAM_VIDEO_LINES)
+	ret
+
+mopar1:	movb	$25, %al
+	cmpb	$0, adapter			# If we are on CGA/MDA/HGA, the
+	jz	mopar2				# screen must have 25 lines.
+
+	movb	%gs:(0x484), %al		# On EGA/VGA, use the EGA+ BIOS
+	incb	%al				# location of max lines.
+mopar2: movb	%al, %fs:(PARAM_VIDEO_LINES)
+	ret
+
+#ifdef CONFIG_VIDEO_SELECT
+# Fetching of VESA frame buffer parameters
+mopar_gr:
+	leaw	modelist+1024, %di
+	movb	$0x23, %fs:(PARAM_HAVE_VGA)
+	movw	16(%di), %ax
+	movw	%ax, %fs:(PARAM_LFB_LINELENGTH)
+	movw	18(%di), %ax
+	movw	%ax, %fs:(PARAM_LFB_WIDTH)
+	movw	20(%di), %ax
+	movw	%ax, %fs:(PARAM_LFB_HEIGHT)
+	movb	25(%di), %al
+	movb	$0, %ah
+	movw	%ax, %fs:(PARAM_LFB_DEPTH)
+	movb	29(%di), %al	
+	movb	$0, %ah
+	movw	%ax, %fs:(PARAM_LFB_PAGES)
+	movl	40(%di), %eax
+	movl	%eax, %fs:(PARAM_LFB_BASE)
+	movl	31(%di), %eax
+	movl	%eax, %fs:(PARAM_LFB_COLORS)
+	movl	35(%di), %eax
+	movl	%eax, %fs:(PARAM_LFB_COLORS+4)
+	movw	0(%di), %ax
+	movw	%ax, %fs:(PARAM_VESA_ATTRIB)
+
+# get video mem size
+	leaw	modelist+1024, %di
+	movw	$0x4f00, %ax
+	int	$0x10
+	xorl	%eax, %eax
+	movw	18(%di), %ax
+	movl	%eax, %fs:(PARAM_LFB_SIZE)
+
+# switching the DAC to 8-bit is for <= 8 bpp only
+	movw	%fs:(PARAM_LFB_DEPTH), %ax
+	cmpw	$8, %ax
+	jg	dac_done
+
+# get DAC switching capability
+	xorl	%eax, %eax
+	movb	10(%di), %al
+	testb	$1, %al
+	jz	dac_set
+
+# attempt to switch DAC to 8-bit
+	movw	$0x4f08, %ax
+	movw	$0x0800, %bx
+	int	$0x10
+	cmpw	$0x004f, %ax
+	jne     dac_set
+	movb    %bh, dac_size		# store actual DAC size
+
+dac_set:
+# set color size to DAC size
+	movb	dac_size, %al
+	movb	%al, %fs:(PARAM_LFB_COLORS+0)
+	movb	%al, %fs:(PARAM_LFB_COLORS+2)
+	movb	%al, %fs:(PARAM_LFB_COLORS+4)
+	movb	%al, %fs:(PARAM_LFB_COLORS+6)
+
+# set color offsets to 0
+	movb	$0, %fs:(PARAM_LFB_COLORS+1)
+	movb	$0, %fs:(PARAM_LFB_COLORS+3)
+	movb	$0, %fs:(PARAM_LFB_COLORS+5)
+	movb	$0, %fs:(PARAM_LFB_COLORS+7)
+
+dac_done:
+# get protected mode interface informations
+	movw	$0x4f0a, %ax
+	xorw	%bx, %bx
+	xorw	%di, %di
+	int	$0x10
+	cmp	$0x004f, %ax
+	jnz	no_pm
+
+	movw	%es, %fs:(PARAM_VESAPM_SEG)
+	movw	%di, %fs:(PARAM_VESAPM_OFF)
+no_pm:	ret
+
+# The video mode menu
+mode_menu:
+	leaw	keymsg, %si			# "Return/Space/Timeout" message
+	call	prtstr
+	call	flush
+nokey:	call	getkt
+
+	cmpb	$0x0d, %al			# ENTER ?
+	je	listm				# yes - manual mode selection
+
+	cmpb	$0x20, %al			# SPACE ?
+	je	defmd1				# no - repeat
+
+	call 	beep
+	jmp	nokey
+
+defmd1:	ret					# No mode chosen? Default 80x25
+
+listm:	call	mode_table			# List mode table
+listm0:	leaw	name_bann, %si			# Print adapter name
+	call	prtstr
+	movw	card_name, %si
+	orw	%si, %si
+	jnz	an2
+
+	movb	adapter, %al
+	leaw	old_name, %si
+	orb	%al, %al
+	jz	an1
+
+	leaw	ega_name, %si
+	decb	%al
+	jz	an1
+
+	leaw	vga_name, %si
+	jmp	an1
+
+an2:	call	prtstr
+	leaw	svga_name, %si
+an1:	call	prtstr
+	leaw	listhdr, %si			# Table header
+	call	prtstr
+	movb	$0x30, %dl			# DL holds mode number
+	leaw	modelist, %si
+lm1:	cmpw	$ASK_VGA, (%si)			# End?
+	jz	lm2
+
+	movb	%dl, %al			# Menu selection number
+	call	prtchr
+	call	prtsp2
+	lodsw
+	call	prthw				# Mode ID
+	call	prtsp2
+	movb	0x1(%si), %al
+	call	prtdec				# Rows
+	movb	$0x78, %al			# the letter 'x'
+	call	prtchr
+	lodsw
+	call	prtdec				# Columns
+	movb	$0x0d, %al			# New line
+	call	prtchr
+	movb	$0x0a, %al
+	call	prtchr
+	incb	%dl				# Next character
+	cmpb	$0x3a, %dl
+	jnz	lm1
+
+	movb	$0x61, %dl
+	jmp	lm1
+
+lm2:	leaw	prompt, %si			# Mode prompt
+	call	prtstr
+	leaw	edit_buf, %di			# Editor buffer
+lm3:	call	getkey
+	cmpb	$0x0d, %al			# Enter?
+	jz	lment
+
+	cmpb	$0x08, %al			# Backspace?
+	jz	lmbs
+
+	cmpb	$0x20, %al			# Printable?
+	jc	lm3
+
+	cmpw	$edit_buf+4, %di		# Enough space?
+	jz	lm3
+
+	stosb
+	call	prtchr
+	jmp	lm3
+
+lmbs:	cmpw	$edit_buf, %di			# Backspace
+	jz	lm3
+
+	decw	%di
+	movb	$0x08, %al
+	call	prtchr
+	call	prtspc
+	movb	$0x08, %al
+	call	prtchr
+	jmp	lm3
+	
+lment:	movb	$0, (%di)
+	leaw	crlft, %si
+	call	prtstr
+	leaw	edit_buf, %si
+	cmpb	$0, (%si)			# Empty string = default mode
+	jz	lmdef
+
+	cmpb	$0, 1(%si)			# One character = menu selection
+	jz	mnusel
+
+	cmpw	$0x6373, (%si)			# "scan" => mode scanning
+	jnz	lmhx
+
+	cmpw	$0x6e61, 2(%si)
+	jz	lmscan
+
+lmhx:	xorw	%bx, %bx			# Else => mode ID in hex
+lmhex:	lodsb
+	orb	%al, %al
+	jz	lmuse1
+
+	subb	$0x30, %al
+	jc	lmbad
+
+	cmpb	$10, %al
+	jc	lmhx1
+
+	subb	$7, %al
+	andb	$0xdf, %al
+	cmpb	$10, %al
+	jc	lmbad
+
+	cmpb	$16, %al
+	jnc	lmbad
+
+lmhx1:	shlw	$4, %bx
+	orb	%al, %bl
+	jmp	lmhex
+
+lmuse1:	movw	%bx, %ax
+	jmp	lmuse
+
+mnusel:	lodsb					# Menu selection
+	xorb	%ah, %ah
+	subb	$0x30, %al
+	jc	lmbad
+
+	cmpb	$10, %al
+	jc	lmuse
+	
+	cmpb	$0x61-0x30, %al
+	jc	lmbad
+	
+	subb	$0x61-0x30-10, %al
+	cmpb	$36, %al
+	jnc	lmbad
+
+lmuse:	call	mode_set
+	jc	lmdef
+
+lmbad:	leaw	unknt, %si
+	call	prtstr
+	jmp	lm2
+lmscan:	cmpb	$0, adapter			# Scanning only on EGA/VGA
+	jz	lmbad
+
+	movw	$0, mt_end			# Scanning of modes is
+	movb	$1, scanning			# done as new autodetection.
+	call	mode_table
+	jmp	listm0
+lmdef:	ret
+
+# Additional parts of mode_set... (relative jumps, you know)
+setv7:						# Video7 extended modes
+	DO_STORE
+	subb	$VIDEO_FIRST_V7>>8, %bh
+	movw	$0x6f05, %ax
+	int	$0x10
+	stc
+	ret
+
+_setrec:	jmp	setrec			# Ugly...
+_set_80x25:	jmp	set_80x25
+
+# Aliases for backward compatibility.
+setalias:
+	movw	$VIDEO_80x25, %ax
+	incw	%bx
+	jz	mode_set
+
+	movb	$VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al
+	incw	%bx
+	jnz	setbad				# Fall-through!
+
+# Setting of user mode (AX=mode ID) => CF=success
+mode_set:
+	movw	%ax, %fs:(0x01fa)		# Store mode for use in acpi_wakeup.S
+	movw	%ax, %bx
+	cmpb	$0xff, %ah
+	jz	setalias
+
+	testb	$VIDEO_RECALC>>8, %ah
+	jnz	_setrec
+
+	cmpb	$VIDEO_FIRST_RESOLUTION>>8, %ah
+	jnc	setres
+	
+	cmpb	$VIDEO_FIRST_SPECIAL>>8, %ah
+	jz	setspc
+	
+	cmpb	$VIDEO_FIRST_V7>>8, %ah
+	jz	setv7
+	
+	cmpb	$VIDEO_FIRST_VESA>>8, %ah
+	jnc	check_vesa
+	
+	orb	%ah, %ah
+	jz	setmenu
+	
+	decb	%ah
+	jz	setbios
+
+setbad:	clc
+	movb	$0, do_restore			# The screen needn't be restored
+	ret
+
+setvesa:
+	DO_STORE
+	subb	$VIDEO_FIRST_VESA>>8, %bh
+	movw	$0x4f02, %ax			# VESA BIOS mode set call
+	int	$0x10
+	cmpw	$0x004f, %ax			# AL=4f if implemented
+	jnz	setbad				# AH=0 if OK
+
+	stc
+	ret
+
+setbios:
+	DO_STORE
+	int	$0x10				# Standard BIOS mode set call
+	pushw	%bx
+	movb	$0x0f, %ah			# Check if really set
+	int	$0x10
+	popw	%bx
+	cmpb	%bl, %al
+	jnz	setbad
+	
+	stc
+	ret
+
+setspc:	xorb	%bh, %bh			# Set special mode
+	cmpb	$VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl
+	jnc	setbad
+	
+	addw	%bx, %bx
+	jmp	*spec_inits(%bx)
+
+setmenu:
+	orb	%al, %al			# 80x25 is an exception
+	jz	_set_80x25
+	
+	pushw	%bx				# Set mode chosen from menu
+	call	mode_table			# Build the mode table
+	popw	%ax
+	shlw	$2, %ax
+	addw	%ax, %si
+	cmpw	%di, %si
+	jnc	setbad
+	
+	movw	(%si), %ax			# Fetch mode ID
+_m_s:	jmp	mode_set
+
+setres:	pushw	%bx				# Set mode chosen by resolution
+	call	mode_table
+	popw	%bx
+	xchgb	%bl, %bh
+setr1:	lodsw
+	cmpw	$ASK_VGA, %ax			# End of the list?
+	jz	setbad
+	
+	lodsw
+	cmpw	%bx, %ax
+	jnz	setr1
+	
+	movw	-4(%si), %ax			# Fetch mode ID
+	jmp	_m_s
+
+check_vesa:
+	leaw	modelist+1024, %di
+	subb	$VIDEO_FIRST_VESA>>8, %bh
+	movw	%bx, %cx			# Get mode information structure
+	movw	$0x4f01, %ax
+	int	$0x10
+	addb	$VIDEO_FIRST_VESA>>8, %bh
+	cmpw	$0x004f, %ax
+	jnz	setbad
+
+	movb	(%di), %al			# Check capabilities.
+	andb	$0x19, %al
+	cmpb	$0x09, %al
+	jz	setvesa				# This is a text mode
+
+	movb	(%di), %al			# Check capabilities.
+	andb	$0x99, %al
+	cmpb	$0x99, %al
+	jnz	_setbad				# Doh! No linear frame buffer.
+
+	subb	$VIDEO_FIRST_VESA>>8, %bh
+	orw	$0x4000, %bx			# Use linear frame buffer
+	movw	$0x4f02, %ax			# VESA BIOS mode set call
+	int	$0x10
+	cmpw	$0x004f, %ax			# AL=4f if implemented
+	jnz	_setbad				# AH=0 if OK
+
+	movb	$1, graphic_mode		# flag graphic mode
+	movb	$0, do_restore			# no screen restore
+	stc
+	ret
+
+_setbad:	jmp	setbad          	# Ugly...
+
+# Recalculate vertical display end registers -- this fixes various
+# inconsistencies of extended modes on many adapters. Called when
+# the VIDEO_RECALC flag is set in the mode ID.
+
+setrec:	subb	$VIDEO_RECALC>>8, %ah		# Set the base mode
+	call	mode_set
+	jnc	rct3
+
+	movw	%gs:(0x485), %ax		# Font size in pixels
+	movb	%gs:(0x484), %bl		# Number of rows
+	incb	%bl
+	mulb	%bl				# Number of visible
+	decw	%ax				# scan lines - 1
+	movw	$0x3d4, %dx
+	movw	%ax, %bx
+	movb	$0x12, %al			# Lower 8 bits
+	movb	%bl, %ah
+	outw	%ax, %dx
+	movb	$0x07, %al		# Bits 8 and 9 in the overflow register
+	call	inidx
+	xchgb	%al, %ah
+	andb	$0xbd, %ah
+	shrb	%bh
+	jnc	rct1
+	orb	$0x02, %ah
+rct1:	shrb	%bh
+	jnc	rct2
+	orb	$0x40, %ah
+rct2:	movb	$0x07, %al
+	outw	%ax, %dx
+	stc
+rct3:	ret
+
+# Table of routines for setting of the special modes.
+spec_inits:
+	.word	set_80x25
+	.word	set_8pixel
+	.word	set_80x43
+	.word	set_80x28
+	.word	set_current
+	.word	set_80x30
+	.word	set_80x34
+	.word	set_80x60
+	.word	set_gfx
+
+# Set the 80x25 mode. If already set, do nothing.
+set_80x25:
+	movw	$0x5019, force_size		# Override possibly broken BIOS
+use_80x25:
+#ifdef CONFIG_VIDEO_400_HACK
+	movw	$0x1202, %ax			# Force 400 scan lines
+	movb	$0x30, %bl
+	int	$0x10
+#else
+	movb	$0x0f, %ah			# Get current mode ID
+	int	$0x10
+	cmpw	$0x5007, %ax	# Mode 7 (80x25 mono) is the only one available
+	jz	st80		# on CGA/MDA/HGA and is also available on EGAM
+
+	cmpw	$0x5003, %ax	# Unknown mode, force 80x25 color
+	jnz	force3
+
+st80:	cmpb	$0, adapter	# CGA/MDA/HGA => mode 3/7 is always 80x25
+	jz	set80
+
+	movb	%gs:(0x0484), %al	# This is EGA+ -- beware of 80x50 etc.
+	orb	%al, %al		# Some buggy BIOS'es set 0 rows
+	jz	set80
+	
+	cmpb	$24, %al		# It's hopefully correct
+	jz	set80
+#endif /* CONFIG_VIDEO_400_HACK */
+force3:	DO_STORE
+	movw	$0x0003, %ax			# Forced set
+	int	$0x10
+set80:	stc
+	ret
+
+# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls.
+set_8pixel:
+	DO_STORE
+	call	use_80x25			# The base is 80x25
+set_8pt:
+	movw	$0x1112, %ax			# Use 8x8 font
+	xorb	%bl, %bl
+	int	$0x10
+	movw	$0x1200, %ax			# Use alternate print screen
+	movb	$0x20, %bl
+	int	$0x10
+	movw	$0x1201, %ax			# Turn off cursor emulation
+	movb	$0x34, %bl
+	int	$0x10
+	movb	$0x01, %ah			# Define cursor scan lines 6-7
+	movw	$0x0607, %cx
+	int	$0x10
+set_current:
+	stc
+	ret
+
+# Set the 80x28 mode. This mode works on all VGA's, because it's a standard
+# 80x25 mode with 14-point fonts instead of 16-point.
+set_80x28:
+	DO_STORE
+	call	use_80x25			# The base is 80x25
+set14:	movw	$0x1111, %ax			# Use 9x14 font
+	xorb	%bl, %bl
+	int	$0x10
+	movb	$0x01, %ah			# Define cursor scan lines 11-12
+	movw	$0x0b0c, %cx
+	int	$0x10
+	stc
+	ret
+
+# Set the 80x43 mode. This mode is works on all VGA's.
+# It's a 350-scanline mode with 8-pixel font.
+set_80x43:
+	DO_STORE
+	movw	$0x1201, %ax			# Set 350 scans
+	movb	$0x30, %bl
+	int	$0x10
+	movw	$0x0003, %ax			# Reset video mode
+	int	$0x10
+	jmp	set_8pt				# Use 8-pixel font
+
+# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font.
+set_80x30:
+	call	use_80x25			# Start with real 80x25
+	DO_STORE
+	movw	$0x3cc, %dx			# Get CRTC port
+	inb	%dx, %al
+	movb	$0xd4, %dl
+	rorb	%al				# Mono or color?
+	jc	set48a
+
+	movb	$0xb4, %dl
+set48a:	movw	$0x0c11, %ax		# Vertical sync end (also unlocks CR0-7)
+ 	call	outidx
+	movw	$0x0b06, %ax			# Vertical total
+ 	call	outidx
+	movw	$0x3e07, %ax			# (Vertical) overflow
+ 	call	outidx
+	movw	$0xea10, %ax			# Vertical sync start
+ 	call	outidx
+	movw	$0xdf12, %ax			# Vertical display end
+	call	outidx
+	movw	$0xe715, %ax			# Vertical blank start
+ 	call	outidx
+	movw	$0x0416, %ax			# Vertical blank end
+ 	call	outidx
+	pushw	%dx
+	movb	$0xcc, %dl			# Misc output register (read)
+ 	inb	%dx, %al
+ 	movb	$0xc2, %dl			# (write)
+ 	andb	$0x0d, %al	# Preserve clock select bits and color bit
+ 	orb	$0xe2, %al			# Set correct sync polarity
+ 	outb	%al, %dx
+	popw	%dx
+	movw	$0x501e, force_size
+	stc					# That's all.
+	ret
+
+# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font.
+set_80x34:
+	call	set_80x30			# Set 480 scans
+	call	set14				# And 14-pt font
+	movw	$0xdb12, %ax			# VGA vertical display end
+	movw	$0x5022, force_size
+setvde:	call	outidx
+	stc
+	ret
+
+# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font.
+set_80x60:
+	call	set_80x30			# Set 480 scans
+	call	set_8pt				# And 8-pt font
+	movw	$0xdf12, %ax			# VGA vertical display end
+	movw	$0x503c, force_size
+	jmp	setvde
+
+# Special hack for ThinkPad graphics
+set_gfx:
+#ifdef CONFIG_VIDEO_GFX_HACK
+	movw	$VIDEO_GFX_BIOS_AX, %ax
+	movw	$VIDEO_GFX_BIOS_BX, %bx
+	int	$0x10
+	movw	$VIDEO_GFX_DUMMY_RESOLUTION, force_size
+	stc
+#endif
+	ret
+
+#ifdef CONFIG_VIDEO_RETAIN
+
+# Store screen contents to temporary buffer.
+store_screen:
+	cmpb	$0, do_restore			# Already stored?
+	jnz	stsr
+
+	testb	$CAN_USE_HEAP, loadflags	# Have we space for storing?
+	jz	stsr
+	
+	pushw	%ax
+	pushw	%bx
+	pushw	force_size			# Don't force specific size
+	movw	$0, force_size
+	call	mode_params			# Obtain params of current mode
+	popw	force_size
+	movb	%fs:(PARAM_VIDEO_LINES), %ah
+	movb	%fs:(PARAM_VIDEO_COLS), %al
+	movw	%ax, %bx			# BX=dimensions
+	mulb	%ah
+	movw	%ax, %cx			# CX=number of characters
+	addw	%ax, %ax			# Calculate image size
+	addw	$modelist+1024+4, %ax
+	cmpw	heap_end_ptr, %ax
+	jnc	sts1				# Unfortunately, out of memory
+
+	movw	%fs:(PARAM_CURSOR_POS), %ax	# Store mode params
+	leaw	modelist+1024, %di
+	stosw
+	movw	%bx, %ax
+	stosw
+	pushw	%ds				# Store the screen
+	movw	video_segment, %ds
+	xorw	%si, %si
+	rep
+	movsw
+	popw	%ds
+	incb	do_restore			# Screen will be restored later
+sts1:	popw	%bx
+	popw	%ax
+stsr:	ret
+
+# Restore screen contents from temporary buffer.
+restore_screen:
+	cmpb	$0, do_restore			# Has the screen been stored?
+	jz	res1
+
+	call	mode_params			# Get parameters of current mode
+	movb	%fs:(PARAM_VIDEO_LINES), %cl
+	movb	%fs:(PARAM_VIDEO_COLS), %ch
+	leaw	modelist+1024, %si		# Screen buffer
+	lodsw					# Set cursor position
+	movw	%ax, %dx
+	cmpb	%cl, %dh
+	jc	res2
+	
+	movb	%cl, %dh
+	decb	%dh
+res2:	cmpb	%ch, %dl
+	jc	res3
+	
+	movb	%ch, %dl
+	decb	%dl
+res3:	movb	$0x02, %ah
+	movb	$0x00, %bh
+	int	$0x10
+	lodsw					# Display size
+	movb	%ah, %dl			# DL=number of lines
+	movb	$0, %ah				# BX=phys. length of orig. line
+	movw	%ax, %bx
+	cmpb	%cl, %dl			# Too many?
+	jc	res4
+
+	pushw	%ax
+	movb	%dl, %al
+	subb	%cl, %al
+	mulb	%bl
+	addw	%ax, %si
+	addw	%ax, %si
+	popw	%ax
+	movb	%cl, %dl
+res4:	cmpb	%ch, %al			# Too wide?
+	jc	res5
+	
+	movb	%ch, %al			# AX=width of src. line
+res5:	movb	$0, %cl
+	xchgb	%ch, %cl
+	movw	%cx, %bp			# BP=width of dest. line
+	pushw	%es
+	movw	video_segment, %es
+	xorw	%di, %di			# Move the data
+	addw	%bx, %bx			# Convert BX and BP to _bytes_
+	addw	%bp, %bp
+res6:	pushw	%si
+	pushw	%di
+	movw	%ax, %cx
+	rep
+	movsw
+	popw	%di
+	popw	%si
+	addw	%bp, %di
+	addw	%bx, %si
+	decb	%dl
+	jnz	res6
+	
+	popw	%es				# Done
+res1:	ret
+#endif /* CONFIG_VIDEO_RETAIN */
+
+# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port)
+outidx:	outb	%al, %dx
+	pushw	%ax
+	movb	%ah, %al
+	incw	%dx
+	outb	%al, %dx
+	decw	%dx
+	popw	%ax
+	ret
+
+# Build the table of video modes (stored after the setup.S code at the
+# `modelist' label. Each video mode record looks like:
+#	.word	MODE-ID		(our special mode ID (see above))
+#	.byte	rows		(number of rows)
+#	.byte	columns		(number of columns)
+# Returns address of the end of the table in DI, the end is marked
+# with a ASK_VGA ID.
+mode_table:
+	movw	mt_end, %di			# Already filled?
+	orw	%di, %di
+	jnz	mtab1x
+	
+	leaw	modelist, %di			# Store standard modes:
+	movl	$VIDEO_80x25 + 0x50190000, %eax	# The 80x25 mode (ALL)
+	stosl
+	movb	adapter, %al			# CGA/MDA/HGA -- no more modes
+	orb	%al, %al
+	jz	mtabe
+	
+	decb	%al
+	jnz	mtabv
+	
+	movl	$VIDEO_8POINT + 0x502b0000, %eax	# The 80x43 EGA mode
+	stosl
+	jmp	mtabe
+
+mtab1x:	jmp	mtab1
+
+mtabv:	leaw	vga_modes, %si			# All modes for std VGA
+	movw	$vga_modes_end-vga_modes, %cx
+	rep	# I'm unable to use movsw as I don't know how to store a half
+	movsb	# of the expression above to cx without using explicit shr.
+
+	cmpb	$0, scanning			# Mode scan requested?
+	jz	mscan1
+	
+	call	mode_scan
+mscan1:
+
+#ifdef CONFIG_VIDEO_LOCAL
+	call	local_modes
+#endif /* CONFIG_VIDEO_LOCAL */
+
+#ifdef CONFIG_VIDEO_VESA
+	call	vesa_modes			# Detect VESA VGA modes
+#endif /* CONFIG_VIDEO_VESA */
+
+#ifdef CONFIG_VIDEO_SVGA
+	cmpb	$0, scanning			# Bypass when scanning
+	jnz	mscan2
+	
+	call	svga_modes			# Detect SVGA cards & modes
+mscan2:
+#endif /* CONFIG_VIDEO_SVGA */
+
+mtabe:
+
+#ifdef CONFIG_VIDEO_COMPACT
+	leaw	modelist, %si
+	movw	%di, %dx
+	movw	%si, %di
+cmt1:	cmpw	%dx, %si			# Scan all modes
+	jz	cmt2
+
+	leaw	modelist, %bx			# Find in previous entries
+	movw	2(%si), %cx
+cmt3:	cmpw	%bx, %si
+	jz	cmt4
+
+	cmpw	2(%bx), %cx			# Found => don't copy this entry
+	jz	cmt5
+
+	addw	$4, %bx
+	jmp	cmt3
+
+cmt4:	movsl					# Copy entry
+	jmp	cmt1
+
+cmt5:	addw	$4, %si				# Skip entry
+	jmp	cmt1
+
+cmt2:
+#endif	/* CONFIG_VIDEO_COMPACT */
+
+	movw	$ASK_VGA, (%di)			# End marker
+	movw	%di, mt_end
+mtab1:	leaw	modelist, %si			# SI=mode list, DI=list end
+ret0:	ret
+
+# Modes usable on all standard VGAs
+vga_modes:
+	.word	VIDEO_8POINT
+	.word	0x5032				# 80x50
+	.word	VIDEO_80x43
+	.word	0x502b				# 80x43
+	.word	VIDEO_80x28
+	.word	0x501c				# 80x28
+	.word	VIDEO_80x30
+	.word	0x501e				# 80x30
+	.word	VIDEO_80x34
+	.word	0x5022				# 80x34
+	.word	VIDEO_80x60
+	.word	0x503c				# 80x60
+#ifdef CONFIG_VIDEO_GFX_HACK
+	.word	VIDEO_GFX_HACK
+	.word	VIDEO_GFX_DUMMY_RESOLUTION
+#endif
+
+vga_modes_end:
+# Detect VESA modes.
+
+#ifdef CONFIG_VIDEO_VESA
+vesa_modes:
+	cmpb	$2, adapter			# VGA only
+	jnz	ret0
+
+	movw	%di, %bp			# BP=original mode table end
+	addw	$0x200, %di			# Buffer space
+	movw	$0x4f00, %ax			# VESA Get card info call
+	int	$0x10
+	movw	%bp, %di
+	cmpw	$0x004f, %ax			# Successful?
+	jnz	ret0
+	
+	cmpw	$0x4556, 0x200(%di)
+	jnz	ret0
+	
+	cmpw	$0x4153, 0x202(%di)
+	jnz	ret0
+	
+	movw	$vesa_name, card_name		# Set name to "VESA VGA"
+	pushw	%gs
+	lgsw	0x20e(%di), %si			# GS:SI=mode list
+	movw	$128, %cx			# Iteration limit
+vesa1:
+# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst.
+# XXX:	lodsw	%gs:(%si), %ax			# Get next mode in the list
+	gs; lodsw
+	cmpw	$0xffff, %ax			# End of the table?
+	jz	vesar
+	
+	cmpw	$0x0080, %ax			# Check validity of mode ID
+	jc	vesa2
+	
+	orb	%ah, %ah		# Valid IDs: 0x0000-0x007f/0x0100-0x07ff
+	jz	vesan			# Certain BIOSes report 0x80-0xff!
+
+	cmpw	$0x0800, %ax
+	jnc	vesae
+
+vesa2:	pushw	%cx
+	movw	%ax, %cx			# Get mode information structure
+	movw	$0x4f01, %ax
+	int	$0x10
+	movw	%cx, %bx			# BX=mode number
+	addb	$VIDEO_FIRST_VESA>>8, %bh
+	popw	%cx
+	cmpw	$0x004f, %ax
+	jnz	vesan			# Don't report errors (buggy BIOSES)
+
+	movb	(%di), %al			# Check capabilities. We require
+	andb	$0x19, %al			# a color text mode.
+	cmpb	$0x09, %al
+	jnz	vesan
+	
+	cmpw	$0xb800, 8(%di)		# Standard video memory address required
+	jnz	vesan
+
+	testb	$2, (%di)			# Mode characteristics supplied?
+	movw	%bx, (%di)			# Store mode number
+	jz	vesa3
+	
+	xorw	%dx, %dx
+	movw	0x12(%di), %bx			# Width
+	orb	%bh, %bh
+	jnz	vesan
+	
+	movb	%bl, 0x3(%di)
+	movw	0x14(%di), %ax			# Height
+	orb	%ah, %ah
+	jnz	vesan
+	
+	movb	%al, 2(%di)
+	mulb	%bl
+	cmpw	$8193, %ax		# Small enough for Linux console driver?
+	jnc	vesan
+
+	jmp	vesaok
+
+vesa3:	subw	$0x8108, %bx	# This mode has no detailed info specified,
+	jc	vesan		# so it must be a standard VESA mode.
+
+	cmpw	$5, %bx
+	jnc	vesan
+
+	movw	vesa_text_mode_table(%bx), %ax
+	movw	%ax, 2(%di)
+vesaok:	addw	$4, %di				# The mode is valid. Store it.
+vesan:	loop	vesa1			# Next mode. Limit exceeded => error
+vesae:	leaw	vesaer, %si
+	call	prtstr
+	movw	%bp, %di			# Discard already found modes.
+vesar:	popw	%gs
+	ret
+
+# Dimensions of standard VESA text modes
+vesa_text_mode_table:
+	.byte	60, 80				# 0108
+	.byte	25, 132				# 0109
+	.byte	43, 132				# 010A
+	.byte	50, 132				# 010B
+	.byte	60, 132				# 010C
+#endif	/* CONFIG_VIDEO_VESA */
+
+# Scan for video modes. A bit dirty, but should work.
+mode_scan:
+	movw	$0x0100, %cx			# Start with mode 0
+scm1:	movb	$0, %ah				# Test the mode
+	movb	%cl, %al
+	int	$0x10
+	movb	$0x0f, %ah
+	int	$0x10
+	cmpb	%cl, %al
+	jnz	scm2				# Mode not set
+
+	movw	$0x3c0, %dx			# Test if it's a text mode
+	movb	$0x10, %al			# Mode bits
+	call	inidx
+	andb	$0x03, %al
+	jnz	scm2
+	
+	movb	$0xce, %dl			# Another set of mode bits
+	movb	$0x06, %al
+	call	inidx
+	shrb	%al
+	jc	scm2
+	
+	movb	$0xd4, %dl			# Cursor location
+	movb	$0x0f, %al
+	call	inidx
+	orb	%al, %al
+	jnz	scm2
+	
+	movw	%cx, %ax			# Ok, store the mode
+	stosw
+	movb	%gs:(0x484), %al		# Number of rows
+	incb	%al
+	stosb
+	movw	%gs:(0x44a), %ax		# Number of columns
+	stosb
+scm2:	incb	%cl
+	jns	scm1
+	
+	movw	$0x0003, %ax			# Return back to mode 3
+	int	$0x10
+	ret
+
+tstidx:	outw	%ax, %dx			# OUT DX,AX and inidx
+inidx:	outb	%al, %dx			# Read from indexed VGA register
+	incw	%dx			# AL=index, DX=index reg port -> AL=data
+	inb	%dx, %al
+	decw	%dx
+	ret
+
+# Try to detect type of SVGA card and supply (usually approximate) video
+# mode table for it.
+
+#ifdef CONFIG_VIDEO_SVGA
+svga_modes:
+	leaw	svga_table, %si			# Test all known SVGA adapters
+dosvga:	lodsw
+	movw	%ax, %bp			# Default mode table
+	orw	%ax, %ax
+	jz	didsv1
+
+	lodsw					# Pointer to test routine
+	pushw	%si
+	pushw	%di
+	pushw	%es
+	movw	$0xc000, %bx
+	movw	%bx, %es
+	call	*%ax				# Call test routine
+	popw	%es
+	popw	%di
+	popw	%si
+	orw	%bp, %bp
+	jz	dosvga
+	
+	movw	%bp, %si			# Found, copy the modes
+	movb	svga_prefix, %ah
+cpsvga:	lodsb
+	orb	%al, %al
+	jz	didsv
+	
+	stosw
+	movsw
+	jmp	cpsvga
+
+didsv:	movw	%si, card_name			# Store pointer to card name
+didsv1:	ret
+
+# Table of all known SVGA cards. For each card, we store a pointer to
+# a table of video modes supported by the card and a pointer to a routine
+# used for testing of presence of the card. The video mode table is always
+# followed by the name of the card or the chipset.
+svga_table:
+	.word	ati_md, ati_test
+	.word	oak_md, oak_test
+	.word	paradise_md, paradise_test
+	.word	realtek_md, realtek_test
+	.word	s3_md, s3_test
+	.word	chips_md, chips_test
+	.word	video7_md, video7_test
+	.word	cirrus5_md, cirrus5_test
+	.word	cirrus6_md, cirrus6_test
+	.word	cirrus1_md, cirrus1_test
+	.word	ahead_md, ahead_test
+	.word	everex_md, everex_test
+	.word	genoa_md, genoa_test
+	.word	trident_md, trident_test
+	.word	tseng_md, tseng_test
+	.word	0
+
+# Test routines and mode tables:
+
+# S3 - The test algorithm was taken from the SuperProbe package
+# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org
+s3_test:
+	movw	$0x0f35, %cx	# we store some constants in cl/ch
+	movw	$0x03d4, %dx
+	movb	$0x38, %al
+	call	inidx
+	movb	%al, %bh	# store current CRT-register 0x38
+	movw	$0x0038, %ax
+	call	outidx		# disable writing to special regs
+	movb	%cl, %al	# check whether we can write special reg 0x35
+	call	inidx
+	movb	%al, %bl	# save the current value of CRT reg 0x35
+	andb	$0xf0, %al	# clear bits 0-3
+	movb	%al, %ah
+	movb	%cl, %al	# and write it to CRT reg 0x35
+	call	outidx
+	call	inidx		# now read it back
+	andb	%ch, %al	# clear the upper 4 bits
+	jz	s3_2		# the first test failed. But we have a
+
+	movb	%bl, %ah	# second chance
+	movb	%cl, %al
+	call	outidx
+	jmp	s3_1		# do the other tests
+
+s3_2:	movw	%cx, %ax	# load ah with 0xf and al with 0x35
+	orb	%bl, %ah	# set the upper 4 bits of ah with the orig value
+	call	outidx		# write ...
+	call	inidx		# ... and reread 
+	andb	%cl, %al	# turn off the upper 4 bits
+	pushw	%ax
+	movb	%bl, %ah	# restore old value in register 0x35
+	movb	%cl, %al
+	call	outidx
+	popw	%ax
+	cmpb	%ch, %al	# setting lower 4 bits was successful => bad
+	je	no_s3		# writing is allowed => this is not an S3
+
+s3_1:	movw	$0x4838, %ax	# allow writing to special regs by putting
+	call	outidx		# magic number into CRT-register 0x38
+	movb	%cl, %al	# check whether we can write special reg 0x35
+	call	inidx
+	movb	%al, %bl
+	andb	$0xf0, %al
+	movb	%al, %ah
+	movb	%cl, %al
+	call	outidx
+	call	inidx
+	andb	%ch, %al
+	jnz	no_s3		# no, we can't write => no S3
+
+	movw	%cx, %ax
+	orb	%bl, %ah
+	call	outidx
+	call	inidx
+	andb	%ch, %al
+	pushw	%ax
+	movb	%bl, %ah	# restore old value in register 0x35
+	movb	%cl, %al
+	call	outidx
+	popw	%ax
+	cmpb	%ch, %al
+	jne	no_s31		# writing not possible => no S3
+	movb	$0x30, %al
+	call	inidx		# now get the S3 id ...
+	leaw	idS3, %di
+	movw	$0x10, %cx
+	repne
+	scasb
+	je	no_s31
+
+	movb	%bh, %ah
+	movb	$0x38, %al
+	jmp	s3rest
+
+no_s3:	movb	$0x35, %al	# restore CRT register 0x35
+	movb	%bl, %ah
+	call	outidx
+no_s31:	xorw	%bp, %bp	# Detection failed
+s3rest:	movb	%bh, %ah
+	movb	$0x38, %al	# restore old value of CRT register 0x38
+	jmp	outidx
+
+idS3:	.byte	0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95
+	.byte	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0
+
+s3_md:	.byte	0x54, 0x2b, 0x84
+	.byte	0x55, 0x19, 0x84
+	.byte	0
+	.ascii	"S3"
+	.byte	0
+
+# ATI cards.
+ati_test:
+	leaw 	idati, %si
+	movw	$0x31, %di
+	movw	$0x09, %cx
+	repe
+	cmpsb
+	je	atiok
+
+	xorw	%bp, %bp
+atiok:	ret
+
+idati:	.ascii	"761295520"
+
+ati_md:	.byte	0x23, 0x19, 0x84
+	.byte	0x33, 0x2c, 0x84
+	.byte	0x22, 0x1e, 0x64
+	.byte	0x21, 0x19, 0x64
+	.byte	0x58, 0x21, 0x50
+	.byte	0x5b, 0x1e, 0x50
+	.byte	0
+	.ascii	"ATI"
+	.byte	0
+
+# AHEAD
+ahead_test:
+	movw	$0x200f, %ax
+	movw	$0x3ce, %dx
+	outw	%ax, %dx
+	incw	%dx
+	inb	%dx, %al
+	cmpb	$0x20, %al
+	je	isahed
+
+	cmpb	$0x21, %al
+	je	isahed
+	
+	xorw	%bp, %bp
+isahed:	ret
+
+ahead_md:
+	.byte	0x22, 0x2c, 0x84
+	.byte	0x23, 0x19, 0x84
+	.byte	0x24, 0x1c, 0x84
+	.byte	0x2f, 0x32, 0xa0
+	.byte	0x32, 0x22, 0x50
+	.byte	0x34, 0x42, 0x50
+	.byte	0
+	.ascii	"Ahead"
+	.byte	0
+
+# Chips & Tech.
+chips_test:
+	movw	$0x3c3, %dx
+	inb	%dx, %al
+	orb	$0x10, %al
+	outb	%al, %dx
+	movw	$0x104, %dx
+	inb	%dx, %al
+	movb	%al, %bl
+	movw	$0x3c3, %dx
+	inb	%dx, %al
+	andb	$0xef, %al
+	outb	%al, %dx
+	cmpb	$0xa5, %bl
+	je	cantok
+	
+	xorw	%bp, %bp
+cantok:	ret
+
+chips_md:
+	.byte	0x60, 0x19, 0x84
+	.byte	0x61, 0x32, 0x84
+	.byte	0
+	.ascii	"Chips & Technologies"
+	.byte	0
+
+# Cirrus Logic 5X0
+cirrus1_test:
+	movw	$0x3d4, %dx
+	movb	$0x0c, %al
+	outb	%al, %dx
+	incw	%dx
+	inb	%dx, %al
+	movb	%al, %bl
+	xorb	%al, %al
+	outb	%al, %dx
+	decw	%dx
+	movb	$0x1f, %al
+	outb	%al, %dx
+	incw	%dx
+	inb	%dx, %al
+	movb	%al, %bh
+	xorb	%ah, %ah
+	shlb	$4, %al
+	movw	%ax, %cx
+	movb	%bh, %al
+	shrb	$4, %al
+	addw	%ax, %cx
+	shlw	$8, %cx
+	addw	$6, %cx
+	movw	%cx, %ax
+	movw	$0x3c4, %dx
+	outw	%ax, %dx
+	incw	%dx
+	inb	%dx, %al
+	andb	%al, %al
+	jnz	nocirr
+	
+	movb	%bh, %al
+	outb	%al, %dx
+	inb	%dx, %al
+	cmpb	$0x01, %al
+	je	iscirr
+
+nocirr:	xorw	%bp, %bp
+iscirr: movw	$0x3d4, %dx
+	movb	%bl, %al
+	xorb	%ah, %ah
+	shlw	$8, %ax
+	addw	$0x0c, %ax
+	outw	%ax, %dx
+	ret
+
+cirrus1_md:
+	.byte	0x1f, 0x19, 0x84
+	.byte	0x20, 0x2c, 0x84
+	.byte	0x22, 0x1e, 0x84
+	.byte	0x31, 0x25, 0x64
+	.byte	0
+	.ascii	"Cirrus Logic 5X0"
+	.byte	0
+
+# Cirrus Logic 54XX
+cirrus5_test:
+	movw	$0x3c4, %dx
+	movb	$6, %al
+	call	inidx
+	movb	%al, %bl			# BL=backup
+	movw	$6, %ax
+	call	tstidx
+	cmpb	$0x0f, %al
+	jne	c5fail
+	
+	movw	$0x1206, %ax
+	call	tstidx
+	cmpb	$0x12, %al
+	jne	c5fail
+	
+	movb	$0x1e, %al
+	call	inidx
+	movb	%al, %bh
+	movb	%bh, %ah
+	andb	$0xc0, %ah
+	movb	$0x1e, %al
+	call	tstidx
+	andb	$0x3f, %al
+	jne	c5xx
+	
+	movb	$0x1e, %al
+	movb	%bh, %ah
+	orb	$0x3f, %ah
+	call	tstidx
+	xorb	$0x3f, %al
+	andb	$0x3f, %al
+c5xx:	pushf
+	movb	$0x1e, %al
+	movb	%bh, %ah
+	outw	%ax, %dx
+	popf
+	je	c5done
+
+c5fail:	xorw	%bp, %bp
+c5done:	movb	$6, %al
+	movb	%bl, %ah
+	outw	%ax, %dx
+	ret
+
+cirrus5_md:
+	.byte	0x14, 0x19, 0x84
+	.byte	0x54, 0x2b, 0x84
+	.byte	0
+	.ascii	"Cirrus Logic 54XX"
+	.byte	0
+
+# Cirrus Logic 64XX -- no known extra modes, but must be identified, because
+# it's misidentified by the Ahead test.
+cirrus6_test:
+	movw	$0x3ce, %dx
+	movb	$0x0a, %al
+	call	inidx
+	movb	%al, %bl	# BL=backup
+	movw	$0xce0a, %ax
+	call	tstidx
+	orb	%al, %al
+	jne	c2fail
+	
+	movw	$0xec0a, %ax
+	call	tstidx
+	cmpb	$0x01, %al
+	jne	c2fail
+	
+	movb	$0xaa, %al
+	call	inidx		# 4X, 5X, 7X and 8X are valid 64XX chip ID's. 
+	shrb	$4, %al
+	subb	$4, %al
+	jz	c6done
+	
+	decb	%al
+	jz	c6done
+	
+	subb	$2, %al
+	jz	c6done
+	
+	decb	%al
+	jz	c6done
+	
+c2fail:	xorw	%bp, %bp
+c6done:	movb	$0x0a, %al
+	movb	%bl, %ah
+	outw	%ax, %dx
+	ret
+
+cirrus6_md:
+	.byte	0
+	.ascii	"Cirrus Logic 64XX"
+	.byte	0
+
+# Everex / Trident
+everex_test:
+	movw	$0x7000, %ax
+	xorw	%bx, %bx
+	int	$0x10
+	cmpb	$0x70, %al
+	jne	noevrx
+	
+	shrw	$4, %dx
+	cmpw	$0x678, %dx
+	je	evtrid
+	
+	cmpw	$0x236, %dx
+	jne	evrxok
+
+evtrid:	leaw	trident_md, %bp
+evrxok:	ret
+
+noevrx:	xorw	%bp, %bp
+	ret
+
+everex_md:
+	.byte	0x03, 0x22, 0x50
+	.byte	0x04, 0x3c, 0x50
+	.byte	0x07, 0x2b, 0x64
+	.byte	0x08, 0x4b, 0x64
+	.byte	0x0a, 0x19, 0x84
+	.byte	0x0b, 0x2c, 0x84
+	.byte	0x16, 0x1e, 0x50
+	.byte	0x18, 0x1b, 0x64
+	.byte	0x21, 0x40, 0xa0
+	.byte	0x40, 0x1e, 0x84
+	.byte	0
+	.ascii	"Everex/Trident"
+	.byte	0
+
+# Genoa.
+genoa_test:
+	leaw	idgenoa, %si			# Check Genoa 'clues'
+	xorw	%ax, %ax
+	movb	%es:(0x37), %al
+	movw	%ax, %di
+	movw	$0x04, %cx
+	decw	%si
+	decw	%di
+l1:	incw	%si
+	incw	%di
+	movb	(%si), %al
+	testb	%al, %al
+	jz	l2
+
+	cmpb	%es:(%di), %al
+l2:	loope 	l1
+	orw	%cx, %cx
+	je	isgen
+	
+	xorw	%bp, %bp
+isgen:	ret
+
+idgenoa: .byte	0x77, 0x00, 0x99, 0x66
+
+genoa_md:
+	.byte	0x58, 0x20, 0x50
+	.byte	0x5a, 0x2a, 0x64
+	.byte	0x60, 0x19, 0x84
+	.byte	0x61, 0x1d, 0x84
+	.byte	0x62, 0x20, 0x84
+	.byte	0x63, 0x2c, 0x84
+	.byte	0x64, 0x3c, 0x84
+	.byte	0x6b, 0x4f, 0x64
+	.byte	0x72, 0x3c, 0x50
+	.byte	0x74, 0x42, 0x50
+	.byte	0x78, 0x4b, 0x64
+	.byte	0
+	.ascii	"Genoa"
+	.byte	0
+
+# OAK
+oak_test:
+	leaw	idoakvga, %si
+	movw	$0x08, %di
+	movw	$0x08, %cx
+	repe
+	cmpsb
+	je	isoak
+	
+	xorw	%bp, %bp
+isoak:	ret
+
+idoakvga: .ascii  "OAK VGA "
+
+oak_md: .byte	0x4e, 0x3c, 0x50
+	.byte	0x4f, 0x3c, 0x84
+	.byte	0x50, 0x19, 0x84
+	.byte	0x51, 0x2b, 0x84
+	.byte	0
+	.ascii	"OAK"
+	.byte	0
+
+# WD Paradise.
+paradise_test:
+	leaw	idparadise, %si
+	movw	$0x7d, %di
+	movw	$0x04, %cx
+	repe
+	cmpsb
+	je	ispara
+	
+	xorw	%bp, %bp
+ispara:	ret
+
+idparadise:	.ascii	"VGA="
+
+paradise_md:
+	.byte	0x41, 0x22, 0x50
+	.byte	0x47, 0x1c, 0x84
+	.byte	0x55, 0x19, 0x84
+	.byte	0x54, 0x2c, 0x84
+	.byte	0
+	.ascii	"Paradise"
+	.byte	0
+
+# Trident.
+trident_test:
+	movw	$0x3c4, %dx
+	movb	$0x0e, %al
+	outb	%al, %dx
+	incw	%dx
+	inb	%dx, %al
+	xchgb	%al, %ah
+	xorb	%al, %al
+	outb	%al, %dx
+	inb	%dx, %al
+	xchgb	%ah, %al
+	movb	%al, %bl	# Strange thing ... in the book this wasn't
+	andb	$0x02, %bl	# necessary but it worked on my card which
+	jz	setb2		# is a trident. Without it the screen goes
+				# blurred ...
+	andb	$0xfd, %al
+	jmp	clrb2		
+
+setb2:	orb	$0x02, %al	
+clrb2:	outb	%al, %dx
+	andb	$0x0f, %ah
+	cmpb	$0x02, %ah
+	je	istrid
+
+	xorw	%bp, %bp
+istrid:	ret
+
+trident_md:
+	.byte	0x50, 0x1e, 0x50
+	.byte	0x51, 0x2b, 0x50
+	.byte	0x52, 0x3c, 0x50
+	.byte	0x57, 0x19, 0x84
+	.byte	0x58, 0x1e, 0x84
+	.byte	0x59, 0x2b, 0x84
+	.byte	0x5a, 0x3c, 0x84
+	.byte	0
+	.ascii	"Trident"
+	.byte	0
+
+# Tseng.
+tseng_test:
+	movw	$0x3cd, %dx
+	inb	%dx, %al	# Could things be this simple ! :-)
+	movb	%al, %bl
+	movb	$0x55, %al
+	outb	%al, %dx
+	inb	%dx, %al
+	movb	%al, %ah
+	movb	%bl, %al
+	outb	%al, %dx
+	cmpb	$0x55, %ah
+ 	je	istsen
+
+isnot:	xorw	%bp, %bp
+istsen:	ret
+
+tseng_md:
+	.byte	0x26, 0x3c, 0x50
+	.byte	0x2a, 0x28, 0x64
+	.byte	0x23, 0x19, 0x84
+	.byte	0x24, 0x1c, 0x84
+	.byte	0x22, 0x2c, 0x84
+	.byte	0x21, 0x3c, 0x84
+	.byte	0
+	.ascii	"Tseng"
+	.byte	0
+
+# Video7.
+video7_test:
+	movw	$0x3cc, %dx
+	inb	%dx, %al
+	movw	$0x3b4, %dx
+	andb	$0x01, %al
+	jz	even7
+
+	movw	$0x3d4, %dx
+even7:	movb	$0x0c, %al
+	outb	%al, %dx
+	incw	%dx
+	inb	%dx, %al
+	movb	%al, %bl
+	movb	$0x55, %al
+	outb	%al, %dx
+	inb	%dx, %al
+	decw	%dx
+	movb	$0x1f, %al
+	outb	%al, %dx
+	incw	%dx
+	inb	%dx, %al
+	movb	%al, %bh
+	decw	%dx
+	movb	$0x0c, %al
+	outb	%al, %dx
+	incw	%dx
+	movb	%bl, %al
+	outb	%al, %dx
+	movb	$0x55, %al
+	xorb	$0xea, %al
+	cmpb	%bh, %al
+	jne	isnot
+	
+	movb	$VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching
+	ret
+
+video7_md:
+	.byte	0x40, 0x2b, 0x50
+	.byte	0x43, 0x3c, 0x50
+	.byte	0x44, 0x3c, 0x64
+	.byte	0x41, 0x19, 0x84
+	.byte	0x42, 0x2c, 0x84
+	.byte	0x45, 0x1c, 0x84
+	.byte	0
+	.ascii	"Video 7"
+	.byte	0
+
+# Realtek VGA
+realtek_test:
+	leaw	idrtvga, %si
+	movw	$0x45, %di
+	movw	$0x0b, %cx
+	repe
+	cmpsb
+	je	isrt
+	
+	xorw	%bp, %bp
+isrt:	ret
+
+idrtvga:	.ascii	"REALTEK VGA"
+
+realtek_md:
+	.byte	0x1a, 0x3c, 0x50
+	.byte	0x1b, 0x19, 0x84
+	.byte	0x1c, 0x1e, 0x84
+	.byte	0x1d, 0x2b, 0x84
+	.byte	0x1e, 0x3c, 0x84
+	.byte	0
+	.ascii	"REALTEK"
+	.byte	0
+
+#endif	/* CONFIG_VIDEO_SVGA */
+
+# User-defined local mode table (VGA only)
+#ifdef CONFIG_VIDEO_LOCAL
+local_modes:
+	leaw	local_mode_table, %si
+locm1:	lodsw
+	orw	%ax, %ax
+	jz	locm2
+	
+	stosw
+	movsw
+	jmp	locm1
+
+locm2:	ret
+
+# This is the table of local video modes which can be supplied manually
+# by the user. Each entry consists of mode ID (word) and dimensions
+# (byte for column count and another byte for row count). These modes
+# are placed before all SVGA and VESA modes and override them if table
+# compacting is enabled. The table must end with a zero word followed
+# by NUL-terminated video adapter name.
+local_mode_table:
+	.word	0x0100				# Example: 40x25
+	.byte	25,40
+	.word	0
+	.ascii	"Local"
+	.byte	0
+#endif	/* CONFIG_VIDEO_LOCAL */
+
+# Read a key and return the ASCII code in al, scan code in ah
+getkey:	xorb	%ah, %ah
+	int	$0x16
+	ret
+
+# Read a key with a timeout of 30 seconds.
+# The hardware clock is used to get the time.
+getkt:	call	gettime
+	addb	$30, %al			# Wait 30 seconds
+	cmpb	$60, %al
+	jl	lminute
+
+	subb	$60, %al
+lminute:
+	movb	%al, %cl
+again:	movb	$0x01, %ah
+	int	$0x16
+	jnz	getkey				# key pressed, so get it
+
+	call	gettime
+	cmpb	%cl, %al
+	jne	again
+
+	movb	$0x20, %al			# timeout, return `space'
+	ret
+
+# Flush the keyboard buffer
+flush:	movb	$0x01, %ah
+	int	$0x16
+	jz	empty
+	
+	xorb	%ah, %ah
+	int	$0x16
+	jmp	flush
+
+empty:	ret
+
+# Print hexadecimal number.
+prthw:	pushw	%ax
+	movb	%ah, %al
+	call	prthb
+	popw	%ax
+prthb:	pushw	%ax
+	shrb	$4, %al
+	call	prthn
+	popw	%ax
+	andb	$0x0f, %al
+prthn:	cmpb	$0x0a, %al
+	jc	prth1
+
+	addb	$0x07, %al
+prth1:	addb	$0x30, %al
+	jmp	prtchr
+
+# Print decimal number in al
+prtdec:	pushw	%ax
+	pushw	%cx
+	xorb	%ah, %ah
+	movb	$0x0a, %cl
+	idivb	%cl
+	cmpb	$0x09, %al
+	jbe	lt100
+
+	call	prtdec
+	jmp	skip10
+
+lt100:	addb	$0x30, %al
+	call	prtchr
+skip10:	movb	%ah, %al
+	addb	$0x30, %al
+	call	prtchr	
+	popw	%cx
+	popw	%ax
+	ret
+
+store_edid:
+	pushw	%es				# just save all registers
+	pushw	%ax
+	pushw	%bx
+	pushw   %cx
+	pushw	%dx
+	pushw   %di
+
+	pushw	%fs
+	popw    %es
+
+	movl	$0x13131313, %eax		# memset block with 0x13
+	movw    $32, %cx
+	movw	$0x140, %di
+	cld
+	rep
+	stosl
+
+	movw	$0x4f15, %ax                    # do VBE/DDC
+	movw	$0x01, %bx
+	movw	$0x00, %cx
+	movw    $0x01, %dx
+	movw	$0x140, %di
+	int	$0x10
+
+	popw	%di				# restore all registers
+	popw	%dx
+	popw	%cx
+	popw	%bx
+	popw	%ax
+	popw	%es
+	ret
+
+# VIDEO_SELECT-only variables
+mt_end:		.word	0	# End of video mode table if built
+edit_buf:	.space	6	# Line editor buffer
+card_name:	.word	0	# Pointer to adapter name
+scanning:	.byte	0	# Performing mode scan
+do_restore:	.byte	0	# Screen contents altered during mode change
+svga_prefix:	.byte	VIDEO_FIRST_BIOS>>8	# Default prefix for BIOS modes
+graphic_mode:	.byte	0	# Graphic mode with a linear frame buffer
+dac_size:	.byte	6	# DAC bit depth
+
+# Status messages
+keymsg:		.ascii	"Press <RETURN> to see video modes available, "
+		.ascii	"<SPACE> to continue or wait 30 secs"
+		.byte	0x0d, 0x0a, 0
+
+listhdr:	.byte	0x0d, 0x0a
+		.ascii	"Mode:    COLSxROWS:"
+
+crlft:		.byte	0x0d, 0x0a, 0
+
+prompt:		.byte	0x0d, 0x0a
+		.asciz	"Enter mode number or `scan': "
+
+unknt:		.asciz	"Unknown mode ID. Try again."
+
+badmdt:		.ascii	"You passed an undefined mode number."
+		.byte	0x0d, 0x0a, 0
+
+vesaer:		.ascii	"Error: Scanning of VESA modes failed. Please "
+		.ascii	"report to <mj@ucw.cz>."
+		.byte	0x0d, 0x0a, 0
+
+old_name:	.asciz	"CGA/MDA/HGA"
+
+ega_name:	.asciz	"EGA"
+
+svga_name:	.ascii	" "
+
+vga_name:	.asciz	"VGA"
+
+vesa_name:	.asciz	"VESA"
+
+name_bann:	.asciz	"Video adapter: "
+#endif /* CONFIG_VIDEO_SELECT */
+
+# Other variables:
+adapter:	.byte	0	# Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA
+video_segment:	.word	0xb800	# Video memory segment
+force_size:	.word	0	# Use this size instead of the one in BIOS vars
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
new file mode 100644
index 000000000000..9ce51dee30b3
--- /dev/null
+++ b/arch/x86_64/defconfig
@@ -0,0 +1,1129 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.11-bk7
+# Sat Mar 12 23:43:44 2005
+#
+CONFIG_X86_64=y
+CONFIG_64BIT=y
+CONFIG_X86=y
+CONFIG_MMU=y
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_X86_CMPXCHG=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_HPET_TIMER=y
+CONFIG_HPET_EMULATE_RTC=y
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_GENERIC_IOMAP=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_CLEAN_COMPILE=y
+CONFIG_LOCK_KERNEL=y
+
+#
+# General setup
+#
+CONFIG_LOCALVERSION=""
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+# CONFIG_BSD_PROCESS_ACCT is not set
+CONFIG_SYSCTL=y
+# CONFIG_AUDIT is not set
+CONFIG_LOG_BUF_SHIFT=18
+# CONFIG_HOTPLUG is not set
+CONFIG_KOBJECT_UEVENT=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+# CONFIG_CPUSETS is not set
+# CONFIG_EMBEDDED is not set
+CONFIG_KALLSYMS=y
+CONFIG_KALLSYMS_ALL=y
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+CONFIG_SHMEM=y
+CONFIG_CC_ALIGN_FUNCTIONS=0
+CONFIG_CC_ALIGN_LABELS=0
+CONFIG_CC_ALIGN_LOOPS=0
+CONFIG_CC_ALIGN_JUMPS=0
+# CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=0
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_OBSOLETE_MODPARM=y
+# CONFIG_MODVERSIONS is not set
+# CONFIG_MODULE_SRCVERSION_ALL is not set
+# CONFIG_KMOD is not set
+CONFIG_STOP_MACHINE=y
+
+#
+# Processor type and features
+#
+# CONFIG_MK8 is not set
+# CONFIG_MPSC is not set
+CONFIG_GENERIC_CPU=y
+CONFIG_X86_L1_CACHE_BYTES=128
+CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_TSC=y
+CONFIG_X86_GOOD_APIC=y
+# CONFIG_MICROCODE is not set
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_X86_HT=y
+CONFIG_X86_IO_APIC=y
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_MTRR=y
+CONFIG_SMP=y
+# CONFIG_PREEMPT is not set
+CONFIG_SCHED_SMT=y
+CONFIG_K8_NUMA=y
+# CONFIG_NUMA_EMU is not set
+CONFIG_DISCONTIGMEM=y
+CONFIG_NUMA=y
+CONFIG_HAVE_DEC_LOCK=y
+CONFIG_NR_CPUS=8
+CONFIG_GART_IOMMU=y
+CONFIG_SWIOTLB=y
+CONFIG_X86_MCE=y
+CONFIG_X86_MCE_INTEL=y
+CONFIG_SECCOMP=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_IRQ_PROBE=y
+
+#
+# Power management options
+#
+CONFIG_PM=y
+# CONFIG_PM_DEBUG is not set
+CONFIG_SOFTWARE_SUSPEND=y
+CONFIG_PM_STD_PARTITION=""
+
+#
+# ACPI (Advanced Configuration and Power Interface) Support
+#
+CONFIG_ACPI=y
+CONFIG_ACPI_BOOT=y
+CONFIG_ACPI_INTERPRETER=y
+CONFIG_ACPI_SLEEP=y
+CONFIG_ACPI_SLEEP_PROC_FS=y
+CONFIG_ACPI_AC=y
+CONFIG_ACPI_BATTERY=y
+CONFIG_ACPI_BUTTON=y
+# CONFIG_ACPI_VIDEO is not set
+CONFIG_ACPI_FAN=y
+CONFIG_ACPI_PROCESSOR=y
+CONFIG_ACPI_THERMAL=y
+CONFIG_ACPI_NUMA=y
+# CONFIG_ACPI_ASUS is not set
+# CONFIG_ACPI_IBM is not set
+CONFIG_ACPI_TOSHIBA=y
+CONFIG_ACPI_BLACKLIST_YEAR=2001
+CONFIG_ACPI_DEBUG=y
+CONFIG_ACPI_BUS=y
+CONFIG_ACPI_EC=y
+CONFIG_ACPI_POWER=y
+CONFIG_ACPI_PCI=y
+CONFIG_ACPI_SYSTEM=y
+# CONFIG_ACPI_CONTAINER is not set
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_DEBUG is not set
+CONFIG_CPU_FREQ_STAT=y
+# CONFIG_CPU_FREQ_STAT_DETAILS is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_TABLE=y
+
+#
+# CPUFreq processor drivers
+#
+CONFIG_X86_POWERNOW_K8=y
+CONFIG_X86_POWERNOW_K8_ACPI=y
+# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
+CONFIG_X86_ACPI_CPUFREQ=y
+
+#
+# shared options
+#
+CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y
+
+#
+# Bus options (PCI etc.)
+#
+CONFIG_PCI=y
+CONFIG_PCI_DIRECT=y
+CONFIG_PCI_MMCONFIG=y
+CONFIG_UNORDERED_IO=y
+CONFIG_PCI_MSI=y
+# CONFIG_PCI_LEGACY_PROC is not set
+# CONFIG_PCI_NAMES is not set
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+# CONFIG_PCCARD is not set
+
+#
+# PC-card bridges
+#
+
+#
+# PCI Hotplug Support
+#
+# CONFIG_HOTPLUG_PCI is not set
+
+#
+# Executable file formats / Emulations
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_BINFMT_MISC is not set
+CONFIG_IA32_EMULATION=y
+CONFIG_IA32_AOUT=y
+CONFIG_COMPAT=y
+CONFIG_SYSVIPC_COMPAT=y
+CONFIG_UID16=y
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+# CONFIG_FW_LOADER is not set
+# CONFIG_DEBUG_DRIVER is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+# CONFIG_PARPORT is not set
+
+#
+# Plug and Play support
+#
+# CONFIG_PNP is not set
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=y
+# CONFIG_BLK_DEV_CRYPTOLOOP is not set
+# CONFIG_BLK_DEV_NBD is not set
+# CONFIG_BLK_DEV_SX8 is not set
+# CONFIG_BLK_DEV_UB is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_LBD=y
+# CONFIG_CDROM_PKTCDVD is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_ATA_OVER_ETH is not set
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+# CONFIG_BLK_DEV_IDE_SATA is not set
+# CONFIG_BLK_DEV_HD_IDE is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+CONFIG_BLK_DEV_IDECD=y
+# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_IDEFLOPPY is not set
+# CONFIG_BLK_DEV_IDESCSI is not set
+# CONFIG_IDE_TASK_IOCTL is not set
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+# CONFIG_BLK_DEV_CMD640 is not set
+CONFIG_BLK_DEV_IDEPCI=y
+# CONFIG_IDEPCI_SHARE_IRQ is not set
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_GENERIC is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+# CONFIG_BLK_DEV_RZ1000 is not set
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+# CONFIG_BLK_DEV_AEC62XX is not set
+# CONFIG_BLK_DEV_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_BLK_DEV_ATIIXP is not set
+# CONFIG_BLK_DEV_CMD64X is not set
+# CONFIG_BLK_DEV_TRIFLEX is not set
+# CONFIG_BLK_DEV_CY82C693 is not set
+# CONFIG_BLK_DEV_CS5520 is not set
+# CONFIG_BLK_DEV_CS5530 is not set
+# CONFIG_BLK_DEV_HPT34X is not set
+# CONFIG_BLK_DEV_HPT366 is not set
+# CONFIG_BLK_DEV_SC1200 is not set
+CONFIG_BLK_DEV_PIIX=y
+# CONFIG_BLK_DEV_NS87415 is not set
+# CONFIG_BLK_DEV_PDC202XX_OLD is not set
+# CONFIG_BLK_DEV_PDC202XX_NEW is not set
+# CONFIG_BLK_DEV_SVWKS is not set
+# CONFIG_BLK_DEV_SIIMAGE is not set
+# CONFIG_BLK_DEV_SIS5513 is not set
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+# CONFIG_BLK_DEV_VIA82CXXX is not set
+# CONFIG_IDE_ARM is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_SCSI=y
+# CONFIG_SCSI_PROC_FS is not set
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=y
+# CONFIG_CHR_DEV_ST is not set
+# CONFIG_CHR_DEV_OSST is not set
+# CONFIG_BLK_DEV_SR is not set
+# CONFIG_CHR_DEV_SG is not set
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+# CONFIG_SCSI_MULTI_LUN is not set
+# CONFIG_SCSI_CONSTANTS is not set
+# CONFIG_SCSI_LOGGING is not set
+
+#
+# SCSI Transport Attributes
+#
+# CONFIG_SCSI_SPI_ATTRS is not set
+# CONFIG_SCSI_FC_ATTRS is not set
+# CONFIG_SCSI_ISCSI_ATTRS is not set
+
+#
+# SCSI low-level drivers
+#
+CONFIG_BLK_DEV_3W_XXXX_RAID=y
+# CONFIG_SCSI_3W_9XXX is not set
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AACRAID is not set
+# CONFIG_SCSI_AIC7XXX is not set
+# CONFIG_SCSI_AIC7XXX_OLD is not set
+CONFIG_SCSI_AIC79XX=y
+CONFIG_AIC79XX_CMDS_PER_DEVICE=32
+CONFIG_AIC79XX_RESET_DELAY_MS=4000
+# CONFIG_AIC79XX_ENABLE_RD_STRM is not set
+# CONFIG_AIC79XX_DEBUG_ENABLE is not set
+CONFIG_AIC79XX_DEBUG_MASK=0
+# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
+# CONFIG_MEGARAID_NEWGEN is not set
+# CONFIG_MEGARAID_LEGACY is not set
+CONFIG_SCSI_SATA=y
+# CONFIG_SCSI_SATA_AHCI is not set
+# CONFIG_SCSI_SATA_SVW is not set
+CONFIG_SCSI_ATA_PIIX=y
+# CONFIG_SCSI_SATA_NV is not set
+# CONFIG_SCSI_SATA_PROMISE is not set
+# CONFIG_SCSI_SATA_QSTOR is not set
+# CONFIG_SCSI_SATA_SX4 is not set
+# CONFIG_SCSI_SATA_SIL is not set
+# CONFIG_SCSI_SATA_SIS is not set
+# CONFIG_SCSI_SATA_ULI is not set
+CONFIG_SCSI_SATA_VIA=y
+# CONFIG_SCSI_SATA_VITESSE is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_EATA_PIO is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+# CONFIG_SCSI_GDTH is not set
+# CONFIG_SCSI_IPS is not set
+# CONFIG_SCSI_INITIO is not set
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_SYM53C8XX_2 is not set
+# CONFIG_SCSI_IPR is not set
+# CONFIG_SCSI_QLOGIC_ISP is not set
+# CONFIG_SCSI_QLOGIC_FC is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+CONFIG_SCSI_QLA2XXX=y
+# CONFIG_SCSI_QLA21XX is not set
+# CONFIG_SCSI_QLA22XX is not set
+# CONFIG_SCSI_QLA2300 is not set
+# CONFIG_SCSI_QLA2322 is not set
+# CONFIG_SCSI_QLA6312 is not set
+# CONFIG_SCSI_DC395x is not set
+# CONFIG_SCSI_DC390T is not set
+# CONFIG_SCSI_DEBUG is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+# CONFIG_MD is not set
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=y
+CONFIG_FUSION_MAX_SGE=40
+# CONFIG_FUSION_CTL is not set
+
+#
+# IEEE 1394 (FireWire) support
+#
+# CONFIG_IEEE1394 is not set
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Networking support
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+# CONFIG_PACKET_MMAP is not set
+# CONFIG_NETLINK_DEV is not set
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+# CONFIG_IP_ADVANCED_ROUTER is not set
+# CONFIG_IP_PNP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_IP_MROUTE is not set
+# CONFIG_ARPD is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_TUNNEL is not set
+CONFIG_IP_TCPDIAG=y
+CONFIG_IP_TCPDIAG_IPV6=y
+CONFIG_IPV6=y
+# CONFIG_IPV6_PRIVACY is not set
+# CONFIG_INET6_AH is not set
+# CONFIG_INET6_ESP is not set
+# CONFIG_INET6_IPCOMP is not set
+# CONFIG_INET6_TUNNEL is not set
+# CONFIG_IPV6_TUNNEL is not set
+# CONFIG_NETFILTER is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+# CONFIG_IP_SCTP is not set
+# CONFIG_ATM is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_DECNET is not set
+# CONFIG_LLC2 is not set
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+
+#
+# QoS and/or fair queueing
+#
+# CONFIG_NET_SCHED is not set
+# CONFIG_NET_CLS_ROUTE is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+CONFIG_NETPOLL=y
+# CONFIG_NETPOLL_RX is not set
+# CONFIG_NETPOLL_TRAP is not set
+CONFIG_NET_POLL_CONTROLLER=y
+# CONFIG_HAMRADIO is not set
+# CONFIG_IRDA is not set
+# CONFIG_BT is not set
+CONFIG_NETDEVICES=y
+# CONFIG_DUMMY is not set
+# CONFIG_BONDING is not set
+# CONFIG_EQUALIZER is not set
+# CONFIG_TUN is not set
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+# CONFIG_HAPPYMEAL is not set
+# CONFIG_SUNGEM is not set
+# CONFIG_NET_VENDOR_3COM is not set
+
+#
+# Tulip family network device support
+#
+# CONFIG_NET_TULIP is not set
+# CONFIG_HP100 is not set
+CONFIG_NET_PCI=y
+# CONFIG_PCNET32 is not set
+CONFIG_AMD8111_ETH=y
+# CONFIG_AMD8111E_NAPI is not set
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_B44 is not set
+CONFIG_FORCEDETH=y
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+# CONFIG_E100 is not set
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+# CONFIG_NE2K_PCI is not set
+CONFIG_8139CP=m
+CONFIG_8139TOO=y
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+# CONFIG_8139TOO_8129 is not set
+# CONFIG_8139_OLD_RX_RESET is not set
+# CONFIG_SIS900 is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_VIA_RHINE is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+# CONFIG_ACENIC is not set
+# CONFIG_DL2K is not set
+CONFIG_E1000=y
+# CONFIG_E1000_NAPI is not set
+# CONFIG_NS83820 is not set
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+# CONFIG_R8169 is not set
+# CONFIG_SK98LIN is not set
+# CONFIG_VIA_VELOCITY is not set
+CONFIG_TIGON3=y
+
+#
+# Ethernet (10000 Mbit)
+#
+# CONFIG_IXGB is not set
+CONFIG_S2IO=m
+# CONFIG_S2IO_NAPI is not set
+# CONFIG_2BUFF_MODE is not set
+
+#
+# Token Ring devices
+#
+# CONFIG_TR is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+# CONFIG_FDDI is not set
+# CONFIG_HIPPI is not set
+# CONFIG_PPP is not set
+# CONFIG_SLIP is not set
+# CONFIG_NET_FC is not set
+# CONFIG_SHAPER is not set
+CONFIG_NETCONSOLE=y
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_INPUT_MOUSEDEV_PSAUX=y
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+# CONFIG_INPUT_JOYDEV is not set
+# CONFIG_INPUT_TSDEV is not set
+CONFIG_INPUT_EVDEV=y
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+# CONFIG_MOUSE_SERIAL is not set
+# CONFIG_MOUSE_VSXXXAA is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TOUCHSCREEN is not set
+# CONFIG_INPUT_MISC is not set
+
+#
+# Hardware I/O ports
+#
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_SERIO_CT82C710 is not set
+# CONFIG_SERIO_PCIPS2 is not set
+CONFIG_SERIO_LIBPS2=y
+# CONFIG_SERIO_RAW is not set
+# CONFIG_GAMEPORT is not set
+CONFIG_SOUND_GAMEPORT=y
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+# CONFIG_SERIAL_8250_ACPI is not set
+CONFIG_SERIAL_8250_NR_UARTS=4
+# CONFIG_SERIAL_8250_EXTENDED is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+# CONFIG_WATCHDOG is not set
+CONFIG_HW_RANDOM=y
+# CONFIG_NVRAM is not set
+CONFIG_RTC=y
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+# CONFIG_DRM is not set
+# CONFIG_MWAVE is not set
+CONFIG_RAW_DRIVER=y
+CONFIG_HPET=y
+# CONFIG_HPET_RTC_IRQ is not set
+CONFIG_HPET_MMAP=y
+CONFIG_MAX_RAW_DEVS=256
+CONFIG_HANGCHECK_TIMER=y
+
+#
+# TPM devices
+#
+# CONFIG_TCG_TPM is not set
+
+#
+# I2C support
+#
+# CONFIG_I2C is not set
+
+#
+# Dallas's 1-wire bus
+#
+# CONFIG_W1 is not set
+
+#
+# Misc devices
+#
+# CONFIG_IBM_ASM is not set
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+
+#
+# Graphics support
+#
+# CONFIG_FB is not set
+CONFIG_VIDEO_SELECT=y
+
+#
+# Console display driver support
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_DUMMY_CONSOLE=y
+
+#
+# Sound
+#
+CONFIG_SOUND=y
+
+#
+# Advanced Linux Sound Architecture
+#
+# CONFIG_SND is not set
+
+#
+# Open Sound System
+#
+CONFIG_SOUND_PRIME=y
+# CONFIG_SOUND_BT878 is not set
+# CONFIG_SOUND_CMPCI is not set
+# CONFIG_SOUND_EMU10K1 is not set
+# CONFIG_SOUND_FUSION is not set
+# CONFIG_SOUND_CS4281 is not set
+# CONFIG_SOUND_ES1370 is not set
+# CONFIG_SOUND_ES1371 is not set
+# CONFIG_SOUND_ESSSOLO1 is not set
+# CONFIG_SOUND_MAESTRO is not set
+# CONFIG_SOUND_MAESTRO3 is not set
+CONFIG_SOUND_ICH=y
+# CONFIG_SOUND_SONICVIBES is not set
+# CONFIG_SOUND_TRIDENT is not set
+# CONFIG_SOUND_MSNDCLAS is not set
+# CONFIG_SOUND_MSNDPIN is not set
+# CONFIG_SOUND_VIA82CXXX is not set
+# CONFIG_SOUND_OSS is not set
+# CONFIG_SOUND_ALI5455 is not set
+# CONFIG_SOUND_FORTE is not set
+# CONFIG_SOUND_RME96XX is not set
+# CONFIG_SOUND_AD1980 is not set
+
+#
+# USB support
+#
+CONFIG_USB=y
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+# CONFIG_USB_SUSPEND is not set
+# CONFIG_USB_OTG is not set
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB_ARCH_HAS_OHCI=y
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_SPLIT_ISO is not set
+# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
+CONFIG_USB_OHCI_HCD=y
+# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+CONFIG_USB_OHCI_LITTLE_ENDIAN=y
+CONFIG_USB_UHCI_HCD=y
+# CONFIG_USB_SL811_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_USB_AUDIO is not set
+# CONFIG_USB_BLUETOOTH_TTY is not set
+# CONFIG_USB_MIDI is not set
+# CONFIG_USB_ACM is not set
+CONFIG_USB_PRINTER=y
+
+#
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information
+#
+CONFIG_USB_STORAGE=y
+# CONFIG_USB_STORAGE_DEBUG is not set
+# CONFIG_USB_STORAGE_RW_DETECT is not set
+# CONFIG_USB_STORAGE_DATAFAB is not set
+# CONFIG_USB_STORAGE_FREECOM is not set
+# CONFIG_USB_STORAGE_ISD200 is not set
+# CONFIG_USB_STORAGE_DPCM is not set
+# CONFIG_USB_STORAGE_USBAT is not set
+# CONFIG_USB_STORAGE_SDDR09 is not set
+# CONFIG_USB_STORAGE_SDDR55 is not set
+# CONFIG_USB_STORAGE_JUMPSHOT is not set
+
+#
+# USB Input Devices
+#
+CONFIG_USB_HID=y
+CONFIG_USB_HIDINPUT=y
+# CONFIG_HID_FF is not set
+# CONFIG_USB_HIDDEV is not set
+# CONFIG_USB_AIPTEK is not set
+# CONFIG_USB_WACOM is not set
+# CONFIG_USB_KBTAB is not set
+# CONFIG_USB_POWERMATE is not set
+# CONFIG_USB_MTOUCH is not set
+# CONFIG_USB_EGALAX is not set
+# CONFIG_USB_XPAD is not set
+# CONFIG_USB_ATI_REMOTE is not set
+
+#
+# USB Imaging devices
+#
+# CONFIG_USB_MDC800 is not set
+# CONFIG_USB_MICROTEK is not set
+
+#
+# USB Multimedia devices
+#
+# CONFIG_USB_DABUSB is not set
+
+#
+# Video4Linux support is needed for USB Multimedia device support
+#
+
+#
+# USB Network Adapters
+#
+# CONFIG_USB_CATC is not set
+# CONFIG_USB_KAWETH is not set
+# CONFIG_USB_PEGASUS is not set
+# CONFIG_USB_RTL8150 is not set
+# CONFIG_USB_USBNET is not set
+CONFIG_USB_MON=y
+
+#
+# USB port drivers
+#
+
+#
+# USB Serial Converter support
+#
+# CONFIG_USB_SERIAL is not set
+
+#
+# USB Miscellaneous drivers
+#
+# CONFIG_USB_EMI62 is not set
+# CONFIG_USB_EMI26 is not set
+# CONFIG_USB_AUERSWALD is not set
+# CONFIG_USB_RIO500 is not set
+# CONFIG_USB_LEGOTOWER is not set
+# CONFIG_USB_LCD is not set
+# CONFIG_USB_LED is not set
+# CONFIG_USB_CYTHERM is not set
+# CONFIG_USB_PHIDGETKIT is not set
+# CONFIG_USB_PHIDGETSERVO is not set
+# CONFIG_USB_IDMOUSE is not set
+# CONFIG_USB_SISUSBVGA is not set
+# CONFIG_USB_TEST is not set
+
+#
+# USB ATM/DSL drivers
+#
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# MMC/SD Card support
+#
+# CONFIG_MMC is not set
+
+#
+# InfiniBand support
+#
+# CONFIG_INFINIBAND is not set
+
+#
+# Firmware Drivers
+#
+# CONFIG_EDD is not set
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+# CONFIG_EXT2_FS_SECURITY is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+# CONFIG_EXT3_FS_SECURITY is not set
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=y
+# CONFIG_REISERFS_CHECK is not set
+# CONFIG_REISERFS_PROC_INFO is not set
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+# CONFIG_REISERFS_FS_SECURITY is not set
+# CONFIG_JFS_FS is not set
+CONFIG_FS_POSIX_ACL=y
+
+#
+# XFS support
+#
+# CONFIG_XFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_ROMFS_FS is not set
+# CONFIG_QUOTA is not set
+CONFIG_DNOTIFY=y
+CONFIG_AUTOFS_FS=y
+# CONFIG_AUTOFS4_FS is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+# CONFIG_JOLIET is not set
+# CONFIG_ZISOFS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_SYSFS=y
+# CONFIG_DEVFS_FS is not set
+# CONFIG_DEVPTS_FS_XATTR is not set
+CONFIG_TMPFS=y
+# CONFIG_TMPFS_XATTR is not set
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_HFSPLUS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+# CONFIG_CRAMFS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V4 is not set
+# CONFIG_NFS_DIRECTIO is not set
+CONFIG_NFSD=y
+CONFIG_NFSD_V3=y
+# CONFIG_NFSD_V4 is not set
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=y
+CONFIG_SUNRPC=y
+# CONFIG_RPCSEC_GSS_KRB5 is not set
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_CIFS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=y
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+# CONFIG_NLS_CODEPAGE_850 is not set
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+# CONFIG_NLS_CODEPAGE_932 is not set
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+CONFIG_NLS_ISO8859_15=y
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+CONFIG_NLS_UTF8=y
+
+#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_KERNEL=y
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_PRINTK_TIME is not set
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_KOBJECT is not set
+# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_FS=y
+CONFIG_INIT_DEBUG=y
+# CONFIG_IOMMU_DEBUG is not set
+CONFIG_KPROBES=y
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITY is not set
+
+#
+# Cryptographic options
+#
+# CONFIG_CRYPTO is not set
+
+#
+# Hardware crypto devices
+#
+
+#
+# Library routines
+#
+# CONFIG_CRC_CCITT is not set
+CONFIG_CRC32=y
+# CONFIG_LIBCRC32C is not set
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile
new file mode 100644
index 000000000000..a12b19da4b59
--- /dev/null
+++ b/arch/x86_64/ia32/Makefile
@@ -0,0 +1,32 @@
+#
+# Makefile for the ia32 kernel emulation subsystem.
+#
+
+obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \
+	ia32_signal.o tls32.o \
+	ia32_binfmt.o fpu32.o ptrace32.o syscall32.o
+
+sysv-$(CONFIG_SYSVIPC) := ipc32.o
+obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
+
+obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
+
+$(obj)/syscall32.o: $(src)/syscall32.c \
+	$(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)
+
+# Teach kbuild about targets
+targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
+
+# The DSO images are built using a special linker script
+quiet_cmd_syscall = SYSCALL $@
+      cmd_syscall = $(CC) -m32 -nostdlib -shared -s \
+			   -Wl,-soname=linux-gate.so.1 -o $@ \
+			   -Wl,-T,$(filter-out FORCE,$^)
+
+$(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
+$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
+	$(call if_changed,syscall)
+
+AFLAGS_vsyscall-sysenter.o = -m32
+AFLAGS_vsyscall-syscall.o = -m32
+CFLAGS_ia32_ioctl.o += -Ifs/
diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c
new file mode 100644
index 000000000000..1c23095f1813
--- /dev/null
+++ b/arch/x86_64/ia32/fpu32.c
@@ -0,0 +1,184 @@
+/* 
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
+ * This is used for ptrace, signals and coredumps in 32bit emulation.
+ * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $
+ */ 
+
+#include <linux/sched.h>
+#include <asm/sigcontext32.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+	unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+ 
+	/* Transform each pair of bits into 01 (valid) or 00 (empty) */
+        tmp = ~twd;
+        tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+        /* and move the valid bits to the lower byte. */
+        tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+        tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+        tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+        return tmp;
+}
+
+static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
+{
+	struct _fpxreg *st = NULL;
+	unsigned long tos = (fxsave->swd >> 11) & 7;
+	unsigned long twd = (unsigned long) fxsave->twd;
+	unsigned long tag;
+	unsigned long ret = 0xffff0000;
+	int i;
+
+#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16);
+
+	for (i = 0 ; i < 8 ; i++) {
+		if (twd & 0x1) {
+			st = FPREG_ADDR( fxsave, (i - tos) & 7 );
+
+			switch (st->exponent & 0x7fff) {
+			case 0x7fff:
+				tag = 2;		/* Special */
+				break;
+			case 0x0000:
+				if ( !st->significand[0] &&
+				     !st->significand[1] &&
+				     !st->significand[2] &&
+				     !st->significand[3] ) {
+					tag = 1;	/* Zero */
+				} else {
+					tag = 2;	/* Special */
+				}
+				break;
+			default:
+				if (st->significand[3] & 0x8000) {
+					tag = 0;	/* Valid */
+				} else {
+					tag = 2;	/* Special */
+				}
+				break;
+			}
+		} else {
+			tag = 3;			/* Empty */
+		}
+		ret |= (tag << (2 * i));
+		twd = twd >> 1;
+	}
+	return ret;
+}
+
+
+static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave,
+					 struct _fpstate_ia32 __user *buf)
+{
+	struct _fpxreg *to;
+	struct _fpreg __user *from;
+	int i;
+	u32 v;
+	int err = 0;
+
+#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf)
+	G(0, fxsave->cwd);
+	G(1, fxsave->swd);
+	G(2, fxsave->twd);
+	fxsave->twd = twd_i387_to_fxsr(fxsave->twd);
+	G(3, fxsave->rip);
+	G(4, v);
+	fxsave->fop = v>>16;	/* cs ignored */
+	G(5, fxsave->rdp);
+	/* 6: ds ignored */
+#undef G
+	if (err) 
+		return -1; 
+
+	to = (struct _fpxreg *)&fxsave->st_space[0];
+	from = &buf->_st[0];
+	for (i = 0 ; i < 8 ; i++, to++, from++) {
+		if (__copy_from_user(to, from, sizeof(*from)))
+			return -1;
+	}
+	return 0;
+}
+
+
+static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf,
+				       struct i387_fxsave_struct *fxsave,
+				       struct pt_regs *regs,
+				       struct task_struct *tsk)
+{
+	struct _fpreg __user *to;
+	struct _fpxreg *from;
+	int i;
+	u16 cs,ds; 
+	int err = 0; 
+
+	if (tsk == current) {
+		/* should be actually ds/cs at fpu exception time,
+		   but that information is not available in 64bit mode. */
+		asm("movw %%ds,%0 " : "=r" (ds)); 
+		asm("movw %%cs,%0 " : "=r" (cs)); 		
+	} else { /* ptrace. task has stopped. */
+		ds = tsk->thread.ds;
+		cs = regs->cs;
+	} 
+
+#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf)
+	P(0, (u32)fxsave->cwd | 0xffff0000);
+	P(1, (u32)fxsave->swd | 0xffff0000);
+	P(2, twd_fxsr_to_i387(fxsave));
+	P(3, (u32)fxsave->rip);
+	P(4,  cs | ((u32)fxsave->fop) << 16); 
+	P(5, fxsave->rdp);
+	P(6, 0xffff0000 | ds);
+#undef P
+
+	if (err) 
+		return -1; 
+
+	to = &buf->_st[0];
+	from = (struct _fpxreg *) &fxsave->st_space[0];
+	for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
+		if (__copy_to_user(to, from, sizeof(*to)))
+			return -1;
+	}
+	return 0;
+}
+
+int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave) 
+{ 
+	clear_fpu(tsk);
+	if (!fsave) { 
+		if (__copy_from_user(&tsk->thread.i387.fxsave, 
+				     &buf->_fxsr_env[0],
+				     sizeof(struct i387_fxsave_struct)))
+			return -1;
+		tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+		set_stopped_child_used_math(tsk);
+	} 
+	return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
+}  
+
+int save_i387_ia32(struct task_struct *tsk, 
+		   struct _fpstate_ia32 __user *buf, 
+		   struct pt_regs *regs,
+		   int fsave)
+{
+	int err = 0;
+
+	init_fpu(tsk);
+	if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk))
+		return -1;
+	if (fsave)
+		return 0;
+	err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
+	if (fsave) 
+		return err ? -1 : 1; 	
+	err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
+	err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
+			      sizeof(struct i387_fxsave_struct));
+	return err ? -1 : 1;
+}
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
new file mode 100644
index 000000000000..1965efc974dc
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -0,0 +1,529 @@
+/*
+ *  a.out loader for x86-64
+ *
+ *  Copyright (C) 1991, 1992, 1996  Linus Torvalds
+ *  Hacked together by Andi Kleen
+ */
+
+#include <linux/module.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/a.out.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/slab.h>
+#include <linux/binfmts.h>
+#include <linux/personality.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/user32.h>
+#include <asm/ia32.h>
+
+#undef WARN_OLD
+#undef CORE_DUMP /* probably broken */
+
+extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
+				unsigned long stack_top, int exec_stack);
+
+static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_aout_library(struct file*);
+
+#if CORE_DUMP
+static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file);
+
+/*
+ * fill in the user structure for a core dump..
+ */
+static void dump_thread32(struct pt_regs * regs, struct user32 * dump)
+{
+	u32 fs,gs;
+
+/* changed the size calculations - should hopefully work better. lbt */
+	dump->magic = CMAGIC;
+	dump->start_code = 0;
+	dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1);
+	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
+	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
+	dump->u_dsize -= dump->u_tsize;
+	dump->u_ssize = 0;
+	dump->u_debugreg[0] = current->thread.debugreg0;  
+	dump->u_debugreg[1] = current->thread.debugreg1;  
+	dump->u_debugreg[2] = current->thread.debugreg2;  
+	dump->u_debugreg[3] = current->thread.debugreg3;  
+	dump->u_debugreg[4] = 0;  
+	dump->u_debugreg[5] = 0;  
+	dump->u_debugreg[6] = current->thread.debugreg6;  
+	dump->u_debugreg[7] = current->thread.debugreg7;  
+
+	if (dump->start_stack < 0xc0000000)
+		dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT;
+
+	dump->regs.ebx = regs->rbx;
+	dump->regs.ecx = regs->rcx;
+	dump->regs.edx = regs->rdx;
+	dump->regs.esi = regs->rsi;
+	dump->regs.edi = regs->rdi;
+	dump->regs.ebp = regs->rbp;
+	dump->regs.eax = regs->rax;
+	dump->regs.ds = current->thread.ds;
+	dump->regs.es = current->thread.es;
+	asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
+	asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; 
+	dump->regs.orig_eax = regs->orig_rax;
+	dump->regs.eip = regs->rip;
+	dump->regs.cs = regs->cs;
+	dump->regs.eflags = regs->eflags;
+	dump->regs.esp = regs->rsp;
+	dump->regs.ss = regs->ss;
+
+#if 1 /* FIXME */
+	dump->u_fpvalid = 0;
+#else
+	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
+#endif
+}
+
+#endif
+
+static struct linux_binfmt aout_format = {
+	.module		= THIS_MODULE,
+	.load_binary	= load_aout_binary,
+	.load_shlib	= load_aout_library,
+#if CORE_DUMP
+	.core_dump	= aout_core_dump,
+#endif
+	.min_coredump	= PAGE_SIZE
+};
+
+static void set_brk(unsigned long start, unsigned long end)
+{
+	start = PAGE_ALIGN(start);
+	end = PAGE_ALIGN(end);
+	if (end <= start)
+		return;
+	down_write(&current->mm->mmap_sem);
+	do_brk(start, end - start);
+	up_write(&current->mm->mmap_sem);
+}
+
+#if CORE_DUMP
+/*
+ * These are the only things you should do on a core-file: use only these
+ * macros to write out all the necessary info.
+ */
+
+static int dump_write(struct file *file, const void *addr, int nr)
+{
+	return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+
+#define DUMP_WRITE(addr, nr)	\
+	if (!dump_write(file, (void *)(addr), (nr))) \
+		goto end_coredump;
+
+#define DUMP_SEEK(offset) \
+if (file->f_op->llseek) { \
+	if (file->f_op->llseek(file,(offset),0) != (offset)) \
+ 		goto end_coredump; \
+} else file->f_pos = (offset)
+
+/*
+ * Routine writes a core dump image in the current directory.
+ * Currently only a stub-function.
+ *
+ * Note that setuid/setgid files won't make a core-dump if the uid/gid
+ * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
+ * field, which also makes sure the core-dumps won't be recursive if the
+ * dumping of the process results in another error..
+ */
+
+static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file)
+{
+	mm_segment_t fs;
+	int has_dumped = 0;
+	unsigned long dump_start, dump_size;
+	struct user32 dump;
+#       define START_DATA(u)	(u.u_tsize << PAGE_SHIFT)
+#       define START_STACK(u)   (u.start_stack)
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	has_dumped = 1;
+	current->flags |= PF_DUMPCORE;
+       	strncpy(dump.u_comm, current->comm, sizeof(current->comm));
+	dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump)));
+	dump.signal = signr;
+	dump_thread32(regs, &dump);
+
+/* If the size of the dump file exceeds the rlimit, then see what would happen
+   if we wrote the stack, but not the data area.  */
+	if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE >
+	    current->signal->rlim[RLIMIT_CORE].rlim_cur)
+		dump.u_dsize = 0;
+
+/* Make sure we have enough room to write the stack and data areas. */
+	if ((dump.u_ssize+1) * PAGE_SIZE >
+	    current->signal->rlim[RLIMIT_CORE].rlim_cur)
+		dump.u_ssize = 0;
+
+/* make sure we actually have a data and stack area to dump */
+	set_fs(USER_DS);
+	if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+		dump.u_dsize = 0;
+	if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+		dump.u_ssize = 0;
+
+	set_fs(KERNEL_DS);
+/* struct user */
+	DUMP_WRITE(&dump,sizeof(dump));
+/* Now dump all of the user data.  Include malloced stuff as well */
+	DUMP_SEEK(PAGE_SIZE);
+/* now we start writing out the user space info */
+	set_fs(USER_DS);
+/* Dump the data area */
+	if (dump.u_dsize != 0) {
+		dump_start = START_DATA(dump);
+		dump_size = dump.u_dsize << PAGE_SHIFT;
+		DUMP_WRITE(dump_start,dump_size);
+	}
+/* Now prepare to dump the stack area */
+	if (dump.u_ssize != 0) {
+		dump_start = START_STACK(dump);
+		dump_size = dump.u_ssize << PAGE_SHIFT;
+		DUMP_WRITE(dump_start,dump_size);
+	}
+/* Finally dump the task struct.  Not be used by gdb, but could be useful */
+	set_fs(KERNEL_DS);
+	DUMP_WRITE(current,sizeof(*current));
+end_coredump:
+	set_fs(fs);
+	return has_dumped;
+}
+#endif
+
+/*
+ * create_aout_tables() parses the env- and arg-strings in new user
+ * memory and creates the pointer tables from them, and puts their
+ * addresses on the "stack", returning the new stack pointer value.
+ */
+static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
+{
+	u32 __user *argv;
+	u32 __user *envp;
+	u32 __user *sp;
+	int argc = bprm->argc;
+	int envc = bprm->envc;
+
+	sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
+	sp -= envc+1;
+	envp = sp;
+	sp -= argc+1;
+	argv = sp;
+	put_user((unsigned long) envp,--sp);
+	put_user((unsigned long) argv,--sp);
+	put_user(argc,--sp);
+	current->mm->arg_start = (unsigned long) p;
+	while (argc-->0) {
+		char c;
+		put_user((u32)(unsigned long)p,argv++);
+		do {
+			get_user(c,p++);
+		} while (c);
+	}
+	put_user(NULL,argv);
+	current->mm->arg_end = current->mm->env_start = (unsigned long) p;
+	while (envc-->0) {
+		char c;
+		put_user((u32)(unsigned long)p,envp++);
+		do {
+			get_user(c,p++);
+		} while (c);
+	}
+	put_user(NULL,envp);
+	current->mm->env_end = (unsigned long) p;
+	return sp;
+}
+
+/*
+ * These are the functions used to load a.out style executables and shared
+ * libraries.  There is no binary dependent code anywhere else.
+ */
+
+static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+{
+	struct exec ex;
+	unsigned long error;
+	unsigned long fd_offset;
+	unsigned long rlim;
+	int retval;
+
+	ex = *((struct exec *) bprm->buf);		/* exec-header */
+	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
+	     N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
+	    N_TRSIZE(ex) || N_DRSIZE(ex) ||
+	    i_size_read(bprm->file->f_dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+		return -ENOEXEC;
+	}
+
+	fd_offset = N_TXTOFF(ex);
+
+	/* Check initial limits. This avoids letting people circumvent
+	 * size limits imposed on them by creating programs with large
+	 * arrays in the data or bss.
+	 */
+	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+	if (rlim >= RLIM_INFINITY)
+		rlim = ~0;
+	if (ex.a_data + ex.a_bss > rlim)
+		return -ENOMEM;
+
+	/* Flush all traces of the currently running executable */
+	retval = flush_old_exec(bprm);
+	if (retval)
+		return retval;
+
+	regs->cs = __USER32_CS; 
+	regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
+		regs->r13 = regs->r14 = regs->r15 = 0;
+
+	/* OK, This is the point of no return */
+	set_personality(PER_LINUX);
+	set_thread_flag(TIF_IA32); 
+	clear_thread_flag(TIF_ABI_PENDING);
+
+	current->mm->end_code = ex.a_text +
+		(current->mm->start_code = N_TXTADDR(ex));
+	current->mm->end_data = ex.a_data +
+		(current->mm->start_data = N_DATADDR(ex));
+	current->mm->brk = ex.a_bss +
+		(current->mm->start_brk = N_BSSADDR(ex));
+	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
+
+	set_mm_counter(current->mm, rss, 0);
+	current->mm->mmap = NULL;
+	compute_creds(bprm);
+ 	current->flags &= ~PF_FORKNOEXEC;
+
+	if (N_MAGIC(ex) == OMAGIC) {
+		unsigned long text_addr, map_size;
+		loff_t pos;
+
+		text_addr = N_TXTADDR(ex);
+
+		pos = 32;
+		map_size = ex.a_text+ex.a_data;
+
+		down_write(&current->mm->mmap_sem);
+		error = do_brk(text_addr & PAGE_MASK, map_size);
+		up_write(&current->mm->mmap_sem);
+
+		if (error != (text_addr & PAGE_MASK)) {
+			send_sig(SIGKILL, current, 0);
+			return error;
+		}
+
+		error = bprm->file->f_op->read(bprm->file, (char *)text_addr,
+			  ex.a_text+ex.a_data, &pos);
+		if ((signed long)error < 0) {
+			send_sig(SIGKILL, current, 0);
+			return error;
+		}
+			 
+		flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
+	} else {
+#ifdef WARN_OLD
+		static unsigned long error_time, error_time2;
+		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
+		    (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ)
+		{
+			printk(KERN_NOTICE "executable not page aligned\n");
+			error_time2 = jiffies;
+		}
+
+		if ((fd_offset & ~PAGE_MASK) != 0 &&
+		    (jiffies-error_time) > 5*HZ)
+		{
+			printk(KERN_WARNING 
+			       "fd_offset is not page aligned. Please convert program: %s\n",
+			       bprm->file->f_dentry->d_name.name);
+			error_time = jiffies;
+		}
+#endif
+
+		if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
+			loff_t pos = fd_offset;
+			down_write(&current->mm->mmap_sem);
+			do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+			up_write(&current->mm->mmap_sem);
+			bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex),
+					ex.a_text+ex.a_data, &pos);
+			flush_icache_range((unsigned long) N_TXTADDR(ex),
+					   (unsigned long) N_TXTADDR(ex) +
+					   ex.a_text+ex.a_data);
+			goto beyond_if;
+		}
+
+		down_write(&current->mm->mmap_sem);
+		error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
+			PROT_READ | PROT_EXEC,
+			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
+			fd_offset);
+		up_write(&current->mm->mmap_sem);
+
+		if (error != N_TXTADDR(ex)) {
+			send_sig(SIGKILL, current, 0);
+			return error;
+		}
+
+		down_write(&current->mm->mmap_sem);
+ 		error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
+				PROT_READ | PROT_WRITE | PROT_EXEC,
+				MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
+				fd_offset + ex.a_text);
+		up_write(&current->mm->mmap_sem);
+		if (error != N_DATADDR(ex)) {
+			send_sig(SIGKILL, current, 0);
+			return error;
+		}
+	}
+beyond_if:
+	set_binfmt(&aout_format);
+
+	set_brk(current->mm->start_brk, current->mm->brk);
+
+	retval = ia32_setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
+	if (retval < 0) { 
+		/* Someone check-me: is this error path enough? */ 
+		send_sig(SIGKILL, current, 0); 
+		return retval;
+	}
+
+	current->mm->start_stack =
+		(unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
+	/* start thread */
+	asm volatile("movl %0,%%fs" :: "r" (0)); \
+	asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
+	load_gs_index(0); 
+	(regs)->rip = ex.a_entry;
+	(regs)->rsp = current->mm->start_stack;
+	(regs)->eflags = 0x200;
+	(regs)->cs = __USER32_CS;
+	(regs)->ss = __USER32_DS;
+	set_fs(USER_DS);
+	if (unlikely(current->ptrace & PT_PTRACED)) {
+		if (current->ptrace & PT_TRACE_EXEC)
+			ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
+		else
+			send_sig(SIGTRAP, current, 0);
+	}
+	return 0;
+}
+
+static int load_aout_library(struct file *file)
+{
+	struct inode * inode;
+	unsigned long bss, start_addr, len;
+	unsigned long error;
+	int retval;
+	struct exec ex;
+
+	inode = file->f_dentry->d_inode;
+
+	retval = -ENOEXEC;
+	error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
+	if (error != sizeof(ex))
+		goto out;
+
+	/* We come in here for the regular a.out style of shared libraries */
+	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
+	    N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
+	    i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+		goto out;
+	}
+
+	if (N_FLAGS(ex))
+		goto out;
+
+	/* For  QMAGIC, the starting address is 0x20 into the page.  We mask
+	   this off to get the starting address for the page */
+
+	start_addr =  ex.a_entry & 0xfffff000;
+
+	if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
+		loff_t pos = N_TXTOFF(ex);
+
+#ifdef WARN_OLD
+		static unsigned long error_time;
+		if ((jiffies-error_time) > 5*HZ)
+		{
+			printk(KERN_WARNING 
+			       "N_TXTOFF is not page aligned. Please convert library: %s\n",
+			       file->f_dentry->d_name.name);
+			error_time = jiffies;
+		}
+#endif
+		down_write(&current->mm->mmap_sem);
+		do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
+		up_write(&current->mm->mmap_sem);
+		
+		file->f_op->read(file, (char *)start_addr,
+			ex.a_text + ex.a_data, &pos);
+		flush_icache_range((unsigned long) start_addr,
+				   (unsigned long) start_addr + ex.a_text + ex.a_data);
+
+		retval = 0;
+		goto out;
+	}
+	/* Now use mmap to map the library into memory. */
+	down_write(&current->mm->mmap_sem);
+	error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
+			PROT_READ | PROT_WRITE | PROT_EXEC,
+			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
+			N_TXTOFF(ex));
+	up_write(&current->mm->mmap_sem);
+	retval = error;
+	if (error != start_addr)
+		goto out;
+
+	len = PAGE_ALIGN(ex.a_text + ex.a_data);
+	bss = ex.a_text + ex.a_data + ex.a_bss;
+	if (bss > len) {
+		down_write(&current->mm->mmap_sem);
+		error = do_brk(start_addr + len, bss - len);
+		up_write(&current->mm->mmap_sem);
+		retval = error;
+		if (error != start_addr + len)
+			goto out;
+	}
+	retval = 0;
+out:
+	return retval;
+}
+
+static int __init init_aout_binfmt(void)
+{
+	return register_binfmt(&aout_format);
+}
+
+static void __exit exit_aout_binfmt(void)
+{
+	unregister_binfmt(&aout_format);
+}
+
+module_init(init_aout_binfmt);
+module_exit(exit_aout_binfmt);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
new file mode 100644
index 000000000000..93d568dfa762
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -0,0 +1,434 @@
+/* 
+ * Written 2000,2002 by Andi Kleen. 
+ * 
+ * Loosely based on the sparc64 and IA64 32bit emulation loaders.
+ * This tricks binfmt_elf.c into loading 32bit binaries using lots 
+ * of ugly preprocessor tricks. Talk about very very poor man's inheritance.
+ */ 
+#include <linux/types.h>
+#include <linux/config.h> 
+#include <linux/stddef.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/string.h>
+#include <linux/binfmts.h>
+#include <linux/mm.h>
+#include <linux/security.h>
+
+#include <asm/segment.h> 
+#include <asm/ptrace.h>
+#include <asm/processor.h>
+#include <asm/user32.h>
+#include <asm/sigcontext32.h>
+#include <asm/fpu32.h>
+#include <asm/i387.h>
+#include <asm/uaccess.h>
+#include <asm/ia32.h>
+#include <asm/vsyscall32.h>
+
+#define ELF_NAME "elf/i386"
+
+#define AT_SYSINFO 32
+#define AT_SYSINFO_EHDR		33
+
+int sysctl_vsyscall32 = 1;
+
+#define ARCH_DLINFO do {  \
+	if (sysctl_vsyscall32) { \
+	NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
+	NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE);    \
+	}	\
+} while(0)
+
+struct file;
+struct elf_phdr; 
+
+#define IA32_EMULATOR 1
+
+#define ELF_ET_DYN_BASE		(TASK_UNMAPPED_32 + 0x1000000)
+
+#undef ELF_ARCH
+#define ELF_ARCH EM_386
+
+#undef ELF_CLASS
+#define ELF_CLASS ELFCLASS32
+
+#define ELF_DATA	ELFDATA2LSB
+
+#define USE_ELF_CORE_DUMP 1
+
+/* Overwrite elfcore.h */ 
+#define _LINUX_ELFCORE_H 1
+typedef unsigned int elf_greg_t;
+
+#define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t))
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+/*
+ * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out
+ * extra segments containing the vsyscall DSO contents.  Dumping its
+ * contents makes post-mortem fully interpretable later without matching up
+ * the same kernel and hardware config to see what PC values meant.
+ * Dumping its extra ELF program headers includes all the other information
+ * a debugger needs to easily find how the vsyscall DSO was being used.
+ */
+#define ELF_CORE_EXTRA_PHDRS		(VSYSCALL32_EHDR->e_phnum)
+#define ELF_CORE_WRITE_EXTRA_PHDRS					      \
+do {									      \
+	const struct elf32_phdr *const vsyscall_phdrs =			      \
+		(const struct elf32_phdr *) (VSYSCALL32_BASE		      \
+					   + VSYSCALL32_EHDR->e_phoff);	      \
+	int i;								      \
+	Elf32_Off ofs = 0;						      \
+	for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) {		      \
+		struct elf32_phdr phdr = vsyscall_phdrs[i];		      \
+		if (phdr.p_type == PT_LOAD) {				      \
+			BUG_ON(ofs != 0);				      \
+			ofs = phdr.p_offset = offset;			      \
+			phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz);	      \
+			phdr.p_filesz = phdr.p_memsz;			      \
+			offset += phdr.p_filesz;			      \
+		}							      \
+		else							      \
+			phdr.p_offset += ofs;				      \
+		phdr.p_paddr = 0; /* match other core phdrs */		      \
+		DUMP_WRITE(&phdr, sizeof(phdr));			      \
+	}								      \
+} while (0)
+#define ELF_CORE_WRITE_EXTRA_DATA					      \
+do {									      \
+	const struct elf32_phdr *const vsyscall_phdrs =			      \
+		(const struct elf32_phdr *) (VSYSCALL32_BASE		      \
+					   + VSYSCALL32_EHDR->e_phoff);	      \
+	int i;								      \
+	for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) {		      \
+		if (vsyscall_phdrs[i].p_type == PT_LOAD)		      \
+			DUMP_WRITE((void *) (u64) vsyscall_phdrs[i].p_vaddr,	      \
+				   PAGE_ALIGN(vsyscall_phdrs[i].p_memsz));    \
+	}								      \
+} while (0)
+
+struct elf_siginfo
+{
+	int	si_signo;			/* signal number */
+	int	si_code;			/* extra code */
+	int	si_errno;			/* errno */
+};
+
+#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0)
+
+struct elf_prstatus
+{
+	struct elf_siginfo pr_info;	/* Info associated with signal */
+	short	pr_cursig;		/* Current signal */
+	unsigned int pr_sigpend;	/* Set of pending signals */
+	unsigned int pr_sighold;	/* Set of held signals */
+	pid_t	pr_pid;
+	pid_t	pr_ppid;
+	pid_t	pr_pgrp;
+	pid_t	pr_sid;
+	struct compat_timeval pr_utime;	/* User time */
+	struct compat_timeval pr_stime;	/* System time */
+	struct compat_timeval pr_cutime;	/* Cumulative user time */
+	struct compat_timeval pr_cstime;	/* Cumulative system time */
+	elf_gregset_t pr_reg;	/* GP registers */
+	int pr_fpvalid;		/* True if math co-processor being used.  */
+};
+
+#define ELF_PRARGSZ	(80)	/* Number of chars for args */
+
+struct elf_prpsinfo
+{
+	char	pr_state;	/* numeric process state */
+	char	pr_sname;	/* char for pr_state */
+	char	pr_zomb;	/* zombie */
+	char	pr_nice;	/* nice val */
+	unsigned int pr_flag;	/* flags */
+	__u16	pr_uid;
+	__u16	pr_gid;
+	pid_t	pr_pid, pr_ppid, pr_pgrp, pr_sid;
+	/* Lots missing */
+	char	pr_fname[16];	/* filename of executable */
+	char	pr_psargs[ELF_PRARGSZ];	/* initial part of arg list */
+};
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
+#define _GET_SEG(x) \
+	({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; })
+
+/* Assumes current==process to be dumped */
+#define ELF_CORE_COPY_REGS(pr_reg, regs)       		\
+	pr_reg[0] = regs->rbx;				\
+	pr_reg[1] = regs->rcx;				\
+	pr_reg[2] = regs->rdx;				\
+	pr_reg[3] = regs->rsi;				\
+	pr_reg[4] = regs->rdi;				\
+	pr_reg[5] = regs->rbp;				\
+	pr_reg[6] = regs->rax;				\
+	pr_reg[7] = _GET_SEG(ds);   			\
+	pr_reg[8] = _GET_SEG(es);			\
+	pr_reg[9] = _GET_SEG(fs);			\
+	pr_reg[10] = _GET_SEG(gs);			\
+	pr_reg[11] = regs->orig_rax;			\
+	pr_reg[12] = regs->rip;				\
+	pr_reg[13] = regs->cs;				\
+	pr_reg[14] = regs->eflags;			\
+	pr_reg[15] = regs->rsp;				\
+	pr_reg[16] = regs->ss;
+
+#define user user32
+
+#define __ASM_X86_64_ELF_H 1
+#define elf_read_implies_exec(ex, have_pt_gnu_stack)	(!(have_pt_gnu_stack))
+//#include <asm/ia32.h>
+#include <linux/elf.h>
+
+typedef struct user_i387_ia32_struct elf_fpregset_t;
+typedef struct user32_fxsr_struct elf_fpxregset_t;
+
+
+static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
+{
+	ELF_CORE_COPY_REGS((*elfregs), regs)
+}
+
+static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
+{	
+	struct pt_regs *pp = (struct pt_regs *)(t->thread.rsp0);
+	--pp;
+	ELF_CORE_COPY_REGS((*elfregs), pp);
+	/* fix wrong segments */ 
+	(*elfregs)[7] = t->thread.ds; 
+	(*elfregs)[9] = t->thread.fsindex; 
+	(*elfregs)[10] = t->thread.gsindex; 
+	(*elfregs)[8] = t->thread.es; 	
+	return 1; 
+}
+
+static inline int 
+elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu)
+{
+	struct _fpstate_ia32 *fpstate = (void*)fpu; 
+	mm_segment_t oldfs = get_fs();
+
+	if (!tsk_used_math(tsk))
+		return 0;
+	if (!regs)
+		regs = (struct pt_regs *)tsk->thread.rsp0;
+	--regs;
+	if (tsk == current)
+		unlazy_fpu(tsk);
+	set_fs(KERNEL_DS); 
+	save_i387_ia32(tsk, fpstate, regs, 1);
+	/* Correct for i386 bug. It puts the fop into the upper 16bits of 
+	   the tag word (like FXSAVE), not into the fcs*/ 
+	fpstate->cssel |= fpstate->tag & 0xffff0000; 
+	set_fs(oldfs); 
+	return 1; 
+}
+
+#define ELF_CORE_COPY_XFPREGS 1
+static inline int 
+elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
+{
+	struct pt_regs *regs = ((struct pt_regs *)(t->thread.rsp0))-1; 
+	if (!tsk_used_math(t))
+		return 0;
+	if (t == current)
+		unlazy_fpu(t); 
+	memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t));
+	xfpu->fcs = regs->cs; 
+	xfpu->fos = t->thread.ds; /* right? */ 
+	return 1;
+}
+
+#undef elf_check_arch
+#define elf_check_arch(x) \
+	((x)->e_machine == EM_386)
+
+extern int force_personality32;
+
+#define ELF_EXEC_PAGESIZE PAGE_SIZE
+#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
+#define ELF_PLATFORM  ("i686")
+#define SET_PERSONALITY(ex, ibcs2)			\
+do {							\
+	unsigned long new_flags = 0;				\
+	if ((ex).e_ident[EI_CLASS] == ELFCLASS32)		\
+		new_flags = _TIF_IA32;				\
+	if ((current_thread_info()->flags & _TIF_IA32)		\
+	    != new_flags)					\
+		set_thread_flag(TIF_ABI_PENDING);		\
+	else							\
+		clear_thread_flag(TIF_ABI_PENDING);		\
+	/* XXX This overwrites the user set personality */	\
+	current->personality |= force_personality32;		\
+} while (0)
+
+/* Override some function names */
+#define elf_format			elf32_format
+
+#define init_elf_binfmt			init_elf32_binfmt
+#define exit_elf_binfmt			exit_elf32_binfmt
+
+#define load_elf_binary load_elf32_binary
+
+#define ELF_PLAT_INIT(r, load_addr)	elf32_init(r)
+#define setup_arg_pages(bprm, stack_top, exec_stack) \
+	ia32_setup_arg_pages(bprm, stack_top, exec_stack)
+int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack);
+
+#undef start_thread
+#define start_thread(regs,new_rip,new_rsp) do { \
+	asm volatile("movl %0,%%fs" :: "r" (0)); \
+	asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \
+	load_gs_index(0); \
+	(regs)->rip = (new_rip); \
+	(regs)->rsp = (new_rsp); \
+	(regs)->eflags = 0x200; \
+	(regs)->cs = __USER32_CS; \
+	(regs)->ss = __USER32_DS; \
+	set_fs(USER_DS); \
+} while(0) 
+
+
+#define elf_map elf32_map
+
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries."); 
+MODULE_AUTHOR("Eric Youngdale, Andi Kleen");
+
+#undef MODULE_DESCRIPTION
+#undef MODULE_AUTHOR
+
+#define elf_addr_t __u32
+
+#undef TASK_SIZE
+#define TASK_SIZE 0xffffffff
+
+static void elf32_init(struct pt_regs *);
+
+#include "../../../fs/binfmt_elf.c" 
+
+static void elf32_init(struct pt_regs *regs)
+{
+	struct task_struct *me = current; 
+	regs->rdi = 0;
+	regs->rsi = 0;
+	regs->rdx = 0;
+	regs->rcx = 0;
+	regs->rax = 0;
+	regs->rbx = 0; 
+	regs->rbp = 0; 
+	regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
+		regs->r13 = regs->r14 = regs->r15 = 0; 
+    me->thread.fs = 0; 
+	me->thread.gs = 0;
+	me->thread.fsindex = 0; 
+	me->thread.gsindex = 0;
+    me->thread.ds = __USER_DS; 
+	me->thread.es = __USER_DS;
+}
+
+int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack)
+{
+	unsigned long stack_base;
+	struct vm_area_struct *mpnt;
+	struct mm_struct *mm = current->mm;
+	int i, ret;
+
+	stack_base = IA32_STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE;
+	mm->arg_start = bprm->p + stack_base;
+
+	bprm->p += stack_base;
+	if (bprm->loader)
+		bprm->loader += stack_base;
+	bprm->exec += stack_base;
+
+	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!mpnt) 
+		return -ENOMEM; 
+	
+	if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) {
+		kmem_cache_free(vm_area_cachep, mpnt);
+		return -ENOMEM;
+	}
+
+	memset(mpnt, 0, sizeof(*mpnt));
+
+	down_write(&mm->mmap_sem);
+	{
+		mpnt->vm_mm = mm;
+		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
+		mpnt->vm_end = IA32_STACK_TOP;
+		if (executable_stack == EXSTACK_ENABLE_X)
+			mpnt->vm_flags = VM_STACK_FLAGS |  VM_EXEC;
+		else if (executable_stack == EXSTACK_DISABLE_X)
+			mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
+		else
+			mpnt->vm_flags = VM_STACK_FLAGS;
+ 		mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? 
+ 			PAGE_COPY_EXEC : PAGE_COPY;
+		if ((ret = insert_vm_struct(mm, mpnt))) {
+			up_write(&mm->mmap_sem);
+			kmem_cache_free(vm_area_cachep, mpnt);
+			return ret;
+		}
+		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
+	} 
+
+	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
+		struct page *page = bprm->page[i];
+		if (page) {
+			bprm->page[i] = NULL;
+			install_arg_page(mpnt, page, stack_base);
+		}
+		stack_base += PAGE_SIZE;
+	}
+	up_write(&mm->mmap_sem);
+	
+	return 0;
+}
+
+static unsigned long
+elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type)
+{
+	unsigned long map_addr;
+	struct task_struct *me = current; 
+
+	down_write(&me->mm->mmap_sem);
+	map_addr = do_mmap(filep, ELF_PAGESTART(addr),
+			   eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot, 
+			   type,
+			   eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr));
+	up_write(&me->mm->mmap_sem);
+	return(map_addr);
+}
+
+#ifdef CONFIG_SYSCTL
+/* Register vsyscall32 into the ABI table */
+#include <linux/sysctl.h>
+
+static ctl_table abi_table2[] = {
+	{ 99, "vsyscall32", &sysctl_vsyscall32, sizeof(int), 0644, NULL,
+	  proc_dointvec },
+	{ 0, }
+}; 
+
+static ctl_table abi_root_table2[] = { 
+	{ .ctl_name = CTL_ABI, .procname = "abi", .mode = 0555, 
+	  .child = abi_table2 }, 
+	{ 0 }, 
+}; 
+
+static __init int ia32_binfmt_init(void)
+{ 
+	register_sysctl_table(abi_root_table2, 1);
+	return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif
diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c
new file mode 100644
index 000000000000..d259f8a6f811
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_ioctl.c
@@ -0,0 +1,201 @@
+/* $Id: ia32_ioctl.c,v 1.25 2002/10/11 07:17:06 ak Exp $
+ * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
+ *
+ * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
+ * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
+ * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * ioctls.
+ */
+
+#define INCLUDES
+#include <linux/syscalls.h>
+#include "compat_ioctl.c"
+#include <asm/mtrr.h>
+#include <asm/ia32.h>
+
+#define CODE
+#include "compat_ioctl.c"
+  
+#ifndef TIOCGDEV
+#define TIOCGDEV       _IOR('T',0x32, unsigned int)
+#endif
+static int tiocgdev(unsigned fd, unsigned cmd,  unsigned int __user *ptr) 
+{ 
+
+	struct file *file = fget(fd);
+	struct tty_struct *real_tty;
+
+	if (!file)
+		return -EBADF;
+	if (file->f_op->ioctl != tty_ioctl)
+		return -EINVAL; 
+	real_tty = (struct tty_struct *)file->private_data;
+	if (!real_tty) 	
+		return -EINVAL; 
+	return put_user(new_encode_dev(tty_devnum(real_tty)), ptr); 
+} 
+
+#define RTC_IRQP_READ32	_IOR('p', 0x0b, unsigned int)	 /* Read IRQ rate   */
+#define RTC_IRQP_SET32	_IOW('p', 0x0c, unsigned int)	 /* Set IRQ rate    */
+#define RTC_EPOCH_READ32	_IOR('p', 0x0d, unsigned)	 /* Read epoch      */
+#define RTC_EPOCH_SET32		_IOW('p', 0x0e, unsigned)	 /* Set epoch       */
+
+static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg) 
+{ 
+	unsigned long val;
+	mm_segment_t oldfs = get_fs(); 
+	int ret; 
+	
+	switch (cmd) { 
+	case RTC_IRQP_READ32: 
+		set_fs(KERNEL_DS); 
+		ret = sys_ioctl(fd, RTC_IRQP_READ, (unsigned long)&val); 
+		set_fs(oldfs); 
+		if (!ret)
+			ret = put_user(val, (unsigned int __user *) arg); 
+		return ret; 
+
+	case RTC_IRQP_SET32: 
+		cmd = RTC_IRQP_SET; 
+		break; 
+
+	case RTC_EPOCH_READ32:
+		set_fs(KERNEL_DS); 
+		ret = sys_ioctl(fd, RTC_EPOCH_READ, (unsigned long) &val); 
+		set_fs(oldfs); 
+		if (!ret)
+			ret = put_user(val, (unsigned int __user *) arg); 
+		return ret; 
+
+	case RTC_EPOCH_SET32:
+		cmd = RTC_EPOCH_SET; 
+		break; 
+	} 
+	return sys_ioctl(fd,cmd,arg); 
+} 
+
+/* /proc/mtrr ioctls */
+
+
+struct mtrr_sentry32
+{
+    compat_ulong_t base;    /*  Base address     */
+    compat_uint_t size;    /*  Size of region   */
+    compat_uint_t type;     /*  Type of region   */
+};
+
+struct mtrr_gentry32
+{
+    compat_ulong_t regnum;   /*  Register number  */
+    compat_uint_t base;    /*  Base address     */
+    compat_uint_t size;    /*  Size of region   */
+    compat_uint_t type;     /*  Type of region   */
+};
+
+#define	MTRR_IOCTL_BASE	'M'
+
+#define MTRRIOC32_ADD_ENTRY        _IOW(MTRR_IOCTL_BASE,  0, struct mtrr_sentry32)
+#define MTRRIOC32_SET_ENTRY        _IOW(MTRR_IOCTL_BASE,  1, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_ENTRY        _IOW(MTRR_IOCTL_BASE,  2, struct mtrr_sentry32)
+#define MTRRIOC32_GET_ENTRY        _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_ENTRY       _IOW(MTRR_IOCTL_BASE,  4, struct mtrr_sentry32)
+#define MTRRIOC32_ADD_PAGE_ENTRY   _IOW(MTRR_IOCTL_BASE,  5, struct mtrr_sentry32)
+#define MTRRIOC32_SET_PAGE_ENTRY   _IOW(MTRR_IOCTL_BASE,  6, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_PAGE_ENTRY   _IOW(MTRR_IOCTL_BASE,  7, struct mtrr_sentry32)
+#define MTRRIOC32_GET_PAGE_ENTRY   _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_PAGE_ENTRY  _IOW(MTRR_IOCTL_BASE,  9, struct mtrr_sentry32)
+
+
+static int mtrr_ioctl32(unsigned int fd, unsigned int cmd, unsigned long arg)
+{ 
+	struct mtrr_gentry g;
+	struct mtrr_sentry s;
+	int get = 0, err = 0; 
+	struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)arg; 
+	mm_segment_t oldfs = get_fs(); 
+
+	switch (cmd) { 
+#define SET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; break 
+#define GET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; get=1; break
+		SET(ADD);
+		SET(SET); 
+		SET(DEL);
+		GET(GET); 
+		SET(KILL);
+		SET(ADD_PAGE); 
+		SET(SET_PAGE); 
+		SET(DEL_PAGE); 
+		GET(GET_PAGE); 
+		SET(KILL_PAGE); 
+	} 
+	
+	if (get) { 
+		err = get_user(g.regnum, &g32->regnum);
+		err |= get_user(g.base, &g32->base);
+		err |= get_user(g.size, &g32->size);
+		err |= get_user(g.type, &g32->type); 
+
+		arg = (unsigned long)&g; 
+	} else { 
+		struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)arg;
+		err = get_user(s.base, &s32->base);
+		err |= get_user(s.size, &s32->size);
+		err |= get_user(s.type, &s32->type);
+
+		arg = (unsigned long)&s; 
+	} 
+	if (err) return err;
+	
+	set_fs(KERNEL_DS); 
+	err = sys_ioctl(fd, cmd, arg); 
+	set_fs(oldfs); 
+		
+	if (!err && get) { 
+		err = put_user(g.base, &g32->base);
+		err |= put_user(g.size, &g32->size);
+		err |= put_user(g.regnum, &g32->regnum);
+		err |= put_user(g.type, &g32->type); 
+	} 
+	return err;
+} 
+
+#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) }, 
+#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl)
+
+struct ioctl_trans ioctl_start[] = { 
+#include <linux/compat_ioctl.h>
+#define DECLARES
+#include "compat_ioctl.c"
+COMPATIBLE_IOCTL(HDIO_SET_KEEPSETTINGS)
+COMPATIBLE_IOCTL(HDIO_SCAN_HWIF)
+COMPATIBLE_IOCTL(BLKRASET)
+COMPATIBLE_IOCTL(0x4B50)   /* KDGHWCLK - not in the kernel, but don't complain */
+COMPATIBLE_IOCTL(0x4B51)   /* KDSHWCLK - not in the kernel, but don't complain */
+COMPATIBLE_IOCTL(FIOQSIZE)
+
+/* And these ioctls need translation */
+HANDLE_IOCTL(TIOCGDEV, tiocgdev)
+/* realtime device */
+HANDLE_IOCTL(RTC_IRQP_READ,  rtc32_ioctl)
+HANDLE_IOCTL(RTC_IRQP_READ32,rtc32_ioctl)
+HANDLE_IOCTL(RTC_IRQP_SET32, rtc32_ioctl)
+HANDLE_IOCTL(RTC_EPOCH_READ32, rtc32_ioctl)
+HANDLE_IOCTL(RTC_EPOCH_SET32, rtc32_ioctl)
+/* take care of sizeof(sizeof()) breakage */
+/* mtrr */
+HANDLE_IOCTL(MTRRIOC32_ADD_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_SET_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_DEL_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_GET_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_KILL_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_ADD_PAGE_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_SET_PAGE_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_DEL_PAGE_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_GET_PAGE_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_KILL_PAGE_ENTRY, mtrr_ioctl32)
+}; 
+
+int ioctl_table_size = ARRAY_SIZE(ioctl_start);
+
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
new file mode 100644
index 000000000000..fbd09b5126ce
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -0,0 +1,621 @@
+/*
+ *  linux/arch/x86_64/ia32/ia32_signal.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
+ *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
+ *  2000-12-*   x86-64 compatibility mode signal handling by Andi Kleen
+ * 
+ *  $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/ptrace.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/compat.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include <asm/ia32.h>
+#include <asm/ptrace.h>
+#include <asm/ia32_unistd.h>
+#include <asm/user32.h>
+#include <asm/sigcontext32.h>
+#include <asm/fpu32.h>
+#include <asm/proto.h>
+#include <asm/vsyscall32.h>
+
+#define DEBUG_SIG 0
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+{
+	int err;
+	if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	/* If you change siginfo_t structure, please make sure that
+	   this code is fixed accordingly.
+	   It should never copy any pad contained in the structure
+	   to avoid security leaks, but must copy the generic
+	   3 ints plus the relevant union member.  */
+	err = __put_user(from->si_signo, &to->si_signo);
+	err |= __put_user(from->si_errno, &to->si_errno);
+ 	err |= __put_user((short)from->si_code, &to->si_code);
+
+	if (from->si_code < 0) {
+		err |= __put_user(from->si_pid, &to->si_pid);
+ 		err |= __put_user(from->si_uid, &to->si_uid);
+ 		err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
+	} else {
+ 		/* First 32bits of unions are always present:
+ 		 * si_pid === si_band === si_tid === si_addr(LS half) */
+		err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]);
+		switch (from->si_code >> 16) {
+		case __SI_FAULT >> 16:
+			break;
+		case __SI_CHLD >> 16:
+			err |= __put_user(from->si_utime, &to->si_utime);
+			err |= __put_user(from->si_stime, &to->si_stime);
+			err |= __put_user(from->si_status, &to->si_status);
+			/* FALL THROUGH */
+		default:
+		case __SI_KILL >> 16:
+			err |= __put_user(from->si_uid, &to->si_uid);
+			break;
+		case __SI_POLL >> 16:
+			err |= __put_user(from->si_fd, &to->si_fd); 
+			break;
+		case __SI_TIMER >> 16:
+			err |= __put_user(from->si_overrun, &to->si_overrun); 
+			err |= __put_user(ptr_to_compat(from->si_ptr),
+					&to->si_ptr);
+			break;
+		case __SI_RT >> 16: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ >> 16:
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
+		}
+	}
+	return err;
+}
+
+int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
+{
+	int err;
+	u32 ptr32;
+	if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	err = __get_user(to->si_signo, &from->si_signo);
+	err |= __get_user(to->si_errno, &from->si_errno);
+	err |= __get_user(to->si_code, &from->si_code);
+
+	err |= __get_user(to->si_pid, &from->si_pid);
+	err |= __get_user(to->si_uid, &from->si_uid);
+	err |= __get_user(ptr32, &from->si_ptr);
+	to->si_ptr = compat_ptr(ptr32);
+
+	return err;
+}
+
+asmlinkage long
+sys32_sigsuspend(int history0, int history1, old_sigset_t mask,
+		 struct pt_regs *regs)
+{
+	sigset_t saveset;
+
+	mask &= _BLOCKABLE;
+	spin_lock_irq(&current->sighand->siglock);
+	saveset = current->blocked;
+	siginitset(&current->blocked, mask);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	regs->rax = -EINTR;
+	while (1) {
+		current->state = TASK_INTERRUPTIBLE;
+		schedule();
+		if (do_signal(regs, &saveset))
+			return -EINTR;
+	}
+}
+
+asmlinkage long
+sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
+		  stack_ia32_t __user *uoss_ptr, 
+		  struct pt_regs *regs)
+{
+	stack_t uss,uoss; 
+	int ret;
+	mm_segment_t seg; 
+	if (uss_ptr) { 
+		u32 ptr;
+		memset(&uss,0,sizeof(stack_t));
+		if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) ||
+			    __get_user(ptr, &uss_ptr->ss_sp) ||
+			    __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
+			    __get_user(uss.ss_size, &uss_ptr->ss_size))
+			return -EFAULT;
+		uss.ss_sp = compat_ptr(ptr);
+	}
+	seg = get_fs(); 
+	set_fs(KERNEL_DS); 
+	ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp);
+	set_fs(seg); 
+	if (ret >= 0 && uoss_ptr)  {
+		if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) ||
+		    __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
+		    __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
+		    __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+			ret = -EFAULT;
+	} 	
+	return ret;	
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+struct sigframe
+{
+	u32 pretcode;
+	int sig;
+	struct sigcontext_ia32 sc;
+	struct _fpstate_ia32 fpstate;
+	unsigned int extramask[_COMPAT_NSIG_WORDS-1];
+	char retcode[8];
+};
+
+struct rt_sigframe
+{
+	u32 pretcode;
+	int sig;
+	u32 pinfo;
+	u32 puc;
+	compat_siginfo_t info;
+	struct ucontext_ia32 uc;
+	struct _fpstate_ia32 fpstate;
+	char retcode[8];
+};
+
+static int
+ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax)
+{
+	unsigned int err = 0;
+	
+	/* Always make any pending restarted system calls return -EINTR */
+	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+#if DEBUG_SIG
+	printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n",
+		sc, sc->err, sc->eip, sc->cs, sc->eflags);
+#endif
+#define COPY(x)		{ \
+	unsigned int reg;			\
+	err |= __get_user(reg, &sc->e ##x);	\
+	regs->r ## x = reg;			\
+}
+
+#define RELOAD_SEG(seg,mask)						\
+	{ unsigned int cur; 						\
+	  unsigned short pre;						\
+	  err |= __get_user(pre, &sc->seg);				\
+    	  asm volatile("movl %%" #seg ",%0" : "=r" (cur));		\
+	  pre |= mask; 							\
+	  if (pre != cur) loadsegment(seg,pre); }
+
+	/* Reload fs and gs if they have changed in the signal handler.
+	   This does not handle long fs/gs base changes in the handler, but 
+	   does not clobber them at least in the normal case. */ 
+	
+	{
+		unsigned gs, oldgs; 
+		err |= __get_user(gs, &sc->gs);
+		gs |= 3; 
+		asm("movl %%gs,%0" : "=r" (oldgs));
+		if (gs != oldgs)
+		load_gs_index(gs); 
+	} 
+	RELOAD_SEG(fs,3);
+	RELOAD_SEG(ds,3);
+	RELOAD_SEG(es,3);
+
+	COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+	COPY(dx); COPY(cx); COPY(ip);
+	/* Don't touch extended registers */ 
+	
+	err |= __get_user(regs->cs, &sc->cs); 
+	regs->cs |= 3;  
+	err |= __get_user(regs->ss, &sc->ss); 
+	regs->ss |= 3; 
+
+	{
+		unsigned int tmpflags;
+		err |= __get_user(tmpflags, &sc->eflags);
+		regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
+		regs->orig_rax = -1;		/* disable syscall checks */
+	}
+
+	{
+		u32 tmp;
+		struct _fpstate_ia32 __user * buf;
+		err |= __get_user(tmp, &sc->fpstate);
+		buf = compat_ptr(tmp);
+		if (buf) {
+			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
+				goto badframe;
+			err |= restore_i387_ia32(current, buf, 0);
+		} else {
+			struct task_struct *me = current;
+			if (used_math()) {
+				clear_fpu(me);
+				clear_used_math();
+			}
+		}
+	}
+
+	{ 
+		u32 tmp;
+		err |= __get_user(tmp, &sc->eax);
+		*peax = tmp;
+	}
+	return err;
+
+badframe:
+	return 1;
+}
+
+asmlinkage long sys32_sigreturn(struct pt_regs *regs)
+{
+	struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8);
+	sigset_t set;
+	unsigned int eax;
+
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+		goto badframe;
+	if (__get_user(set.sig[0], &frame->sc.oldmask)
+	    || (_COMPAT_NSIG_WORDS > 1
+		&& __copy_from_user((((char *) &set.sig) + 4), &frame->extramask,
+				    sizeof(frame->extramask))))
+		goto badframe;
+
+	sigdelsetmask(&set, ~_BLOCKABLE);
+	spin_lock_irq(&current->sighand->siglock);
+	current->blocked = set;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	
+	if (ia32_restore_sigcontext(regs, &frame->sc, &eax))
+		goto badframe;
+	return eax;
+
+badframe:
+	signal_fault(regs, frame, "32bit sigreturn");
+	return 0;
+}	
+
+asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
+{
+	struct rt_sigframe __user *frame;
+	sigset_t set;
+	unsigned int eax;
+	struct pt_regs tregs;
+
+	frame = (struct rt_sigframe __user *)(regs->rsp - 4);
+
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+		goto badframe;
+	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+		goto badframe;
+
+	sigdelsetmask(&set, ~_BLOCKABLE);
+	spin_lock_irq(&current->sighand->siglock);
+	current->blocked = set;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	
+	if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+		goto badframe;
+
+	tregs = *regs;
+	if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
+		goto badframe;
+
+	return eax;
+
+badframe:
+	signal_fault(regs,frame,"32bit rt sigreturn");
+	return 0;
+}	
+
+/*
+ * Set up a signal frame.
+ */
+
+static int
+ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate,
+		 struct pt_regs *regs, unsigned int mask)
+{
+	int tmp, err = 0;
+	u32 eflags;
+
+	tmp = 0;
+	__asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
+	err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+	__asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
+	err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
+	__asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp));
+	err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
+	__asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
+	err |= __put_user(tmp, (unsigned int __user *)&sc->es);
+
+	err |= __put_user((u32)regs->rdi, &sc->edi);
+	err |= __put_user((u32)regs->rsi, &sc->esi);
+	err |= __put_user((u32)regs->rbp, &sc->ebp);
+	err |= __put_user((u32)regs->rsp, &sc->esp);
+	err |= __put_user((u32)regs->rbx, &sc->ebx);
+	err |= __put_user((u32)regs->rdx, &sc->edx);
+	err |= __put_user((u32)regs->rcx, &sc->ecx);
+	err |= __put_user((u32)regs->rax, &sc->eax);
+	err |= __put_user((u32)regs->cs, &sc->cs);
+	err |= __put_user((u32)regs->ss, &sc->ss);
+	err |= __put_user(current->thread.trap_no, &sc->trapno);
+	err |= __put_user(current->thread.error_code, &sc->err);
+	err |= __put_user((u32)regs->rip, &sc->eip);
+	eflags = regs->eflags;
+	if (current->ptrace & PT_PTRACED)
+		eflags &= ~TF_MASK;
+	err |= __put_user((u32)eflags, &sc->eflags);
+	err |= __put_user((u32)regs->rsp, &sc->esp_at_signal);
+
+	tmp = save_i387_ia32(current, fpstate, regs, 0);
+	if (tmp < 0)
+		err = -EFAULT;
+	else { 
+		clear_used_math();
+		stts();
+		err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
+					&sc->fpstate);
+	}
+
+	/* non-iBCS2 extensions.. */
+	err |= __put_user(mask, &sc->oldmask);
+	err |= __put_user(current->thread.cr2, &sc->cr2);
+
+	return err;
+}
+
+/*
+ * Determine which stack to use..
+ */
+static void __user *
+get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
+{
+	unsigned long rsp;
+
+	/* Default to using normal stack */
+	rsp = regs->rsp;
+
+	/* This is the X/Open sanctioned signal stack switching.  */
+	if (ka->sa.sa_flags & SA_ONSTACK) {
+		if (sas_ss_flags(rsp) == 0)
+			rsp = current->sas_ss_sp + current->sas_ss_size;
+	}
+
+	/* This is the legacy signal stack switching. */
+	else if ((regs->ss & 0xffff) != __USER_DS &&
+		!(ka->sa.sa_flags & SA_RESTORER) &&
+		 ka->sa.sa_restorer) {
+		rsp = (unsigned long) ka->sa.sa_restorer;
+	}
+
+	return (void __user *)((rsp - frame_size) & -8UL);
+}
+
+void ia32_setup_frame(int sig, struct k_sigaction *ka,
+			compat_sigset_t *set, struct pt_regs * regs)
+{
+	struct sigframe __user *frame;
+	int err = 0;
+
+	frame = get_sigframe(ka, regs, sizeof(*frame));
+
+	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+		goto give_sigsegv;
+
+	{
+		struct exec_domain *ed = current_thread_info()->exec_domain;
+		err |= __put_user((ed
+		           && ed->signal_invmap
+		           && sig < 32
+		           ? ed->signal_invmap[sig]
+		           : sig),
+		          &frame->sig);
+	}
+	if (err)
+		goto give_sigsegv;
+
+	err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs,
+					set->sig[0]);
+	if (err)
+		goto give_sigsegv;
+
+	if (_COMPAT_NSIG_WORDS > 1) {
+		err |= __copy_to_user(frame->extramask, &set->sig[1],
+				      sizeof(frame->extramask));
+	}
+	if (err)
+		goto give_sigsegv;
+
+	/* Return stub is in 32bit vsyscall page */
+	{ 
+		void __user *restorer = VSYSCALL32_SIGRETURN; 
+		if (ka->sa.sa_flags & SA_RESTORER)
+			restorer = ka->sa.sa_restorer;       
+		err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
+	}
+	/* These are actually not used anymore, but left because some 
+	   gdb versions depend on them as a marker. */
+	{ 
+		/* copy_to_user optimizes that into a single 8 byte store */
+		static const struct { 
+			u16 poplmovl;
+			u32 val;
+			u16 int80;    
+			u16 pad; 
+		} __attribute__((packed)) code = { 
+			0xb858,		 /* popl %eax ; movl $...,%eax */
+			__NR_ia32_sigreturn,   
+			0x80cd,		/* int $0x80 */
+			0,
+		}; 
+		err |= __copy_to_user(frame->retcode, &code, 8); 
+	}
+	if (err)
+		goto give_sigsegv;
+
+	/* Set up registers for signal handler */
+	regs->rsp = (unsigned long) frame;
+	regs->rip = (unsigned long) ka->sa.sa_handler;
+
+	asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 
+	asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 
+
+	regs->cs = __USER32_CS; 
+	regs->ss = __USER32_DS; 
+
+	set_fs(USER_DS);
+	if (regs->eflags & TF_MASK) {
+		if (current->ptrace & PT_PTRACED) {
+			ptrace_notify(SIGTRAP);
+		} else {
+			regs->eflags &= ~TF_MASK;
+		}
+	}
+
+#if DEBUG_SIG
+	printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+		current->comm, current->pid, frame, regs->rip, frame->pretcode);
+#endif
+
+	return;
+
+give_sigsegv:
+	force_sigsegv(sig, current);
+}
+
+void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+			   compat_sigset_t *set, struct pt_regs * regs)
+{
+	struct rt_sigframe __user *frame;
+	int err = 0;
+
+	frame = get_sigframe(ka, regs, sizeof(*frame));
+
+	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+		goto give_sigsegv;
+
+	{
+		struct exec_domain *ed = current_thread_info()->exec_domain;
+		err |= __put_user((ed
+		    	   && ed->signal_invmap
+		    	   && sig < 32
+		    	   ? ed->signal_invmap[sig]
+			   : sig),
+			  &frame->sig);
+	}
+	err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
+	err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
+	err |= copy_siginfo_to_user32(&frame->info, info);
+	if (err)
+		goto give_sigsegv;
+
+	/* Create the ucontext.  */
+	err |= __put_user(0, &frame->uc.uc_flags);
+	err |= __put_user(0, &frame->uc.uc_link);
+	err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+	err |= __put_user(sas_ss_flags(regs->rsp),
+			  &frame->uc.uc_stack.ss_flags);
+	err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+	err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+			        regs, set->sig[0]);
+	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+	if (err)
+		goto give_sigsegv;
+
+	
+	{ 
+		void __user *restorer = VSYSCALL32_RTSIGRETURN; 
+		if (ka->sa.sa_flags & SA_RESTORER)
+			restorer = ka->sa.sa_restorer;       
+		err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
+	}
+
+	/* This is movl $,%eax ; int $0x80 */
+	/* Not actually used anymore, but left because some gdb versions
+	   need it. */ 
+	{ 
+		/* __copy_to_user optimizes that into a single 8 byte store */
+		static const struct { 
+			u8 movl; 
+			u32 val; 
+			u16 int80; 
+			u16 pad;
+			u8  pad2;				
+		} __attribute__((packed)) code = { 
+			0xb8,
+			__NR_ia32_rt_sigreturn,
+			0x80cd,
+			0,
+		}; 
+		err |= __copy_to_user(frame->retcode, &code, 8); 
+	} 
+	if (err)
+		goto give_sigsegv;
+
+	/* Set up registers for signal handler */
+	regs->rsp = (unsigned long) frame;
+	regs->rip = (unsigned long) ka->sa.sa_handler;
+
+	asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 
+	asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 
+	
+	regs->cs = __USER32_CS; 
+	regs->ss = __USER32_DS; 
+
+	set_fs(USER_DS);
+	if (regs->eflags & TF_MASK) {
+		if (current->ptrace & PT_PTRACED) {
+			ptrace_notify(SIGTRAP);
+		} else {
+			regs->eflags &= ~TF_MASK;
+		}
+	}
+
+#if DEBUG_SIG
+	printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+		current->comm, current->pid, frame, regs->rip, frame->pretcode);
+#endif
+
+	return;
+
+give_sigsegv:
+	force_sigsegv(sig, current);
+}
+
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
new file mode 100644
index 000000000000..f3ca0db85b5b
--- /dev/null
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -0,0 +1,602 @@
+/*
+ * Compatibility mode system call entry point for x86-64. 
+ * 		
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */		 
+
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/offset.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>	
+#include <asm/thread_info.h>	
+#include <asm/segment.h>
+#include <asm/vsyscall32.h>
+#include <linux/linkage.h>
+
+	.macro IA32_ARG_FIXUP noebp=0
+	movl	%edi,%r8d
+	.if \noebp
+	.else
+	movl	%ebp,%r9d
+	.endif
+	xchg	%ecx,%esi
+	movl	%ebx,%edi
+	movl	%edx,%edx	/* zero extension */
+	.endm 
+
+	/* clobbers %eax */	
+	.macro  CLEAR_RREGS
+	xorl 	%eax,%eax
+	movq	%rax,R11(%rsp)
+	movq	%rax,R10(%rsp)
+	movq	%rax,R9(%rsp)
+	movq	%rax,R8(%rsp)
+	.endm
+
+/*
+ * 32bit SYSENTER instruction entry.
+ *
+ * Arguments:
+ * %eax	System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp user stack
+ * 0(%ebp) Arg6	
+ * 	
+ * Interrupts off.
+ *	
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below.	Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */ 	
+ENTRY(ia32_sysenter_target)
+	CFI_STARTPROC
+	swapgs
+	movq	%gs:pda_kernelstack, %rsp
+	addq	$(PDA_STACKOFFSET),%rsp	
+	sti	
+ 	movl	%ebp,%ebp		/* zero extension */
+	pushq	$__USER32_DS
+	pushq	%rbp
+	pushfq
+	movl	$VSYSCALL32_SYSEXIT, %r10d
+	pushq	$__USER32_CS
+	movl	%eax, %eax
+	pushq	%r10
+	pushq	%rax
+	cld
+	SAVE_ARGS 0,0,1
+ 	/* no need to do an access_ok check here because rbp has been
+ 	   32bit zero extended */ 
+1:	movl	(%rbp),%r9d
+ 	.section __ex_table,"a"
+ 	.quad 1b,ia32_badarg
+ 	.previous	
+	GET_THREAD_INFO(%r10)
+	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+	jnz  sysenter_tracesys
+sysenter_do_call:	
+	cmpl	$(IA32_NR_syscalls),%eax
+	jae	ia32_badsys
+	IA32_ARG_FIXUP 1
+	call	*ia32_sys_call_table(,%rax,8)
+	movq	%rax,RAX-ARGOFFSET(%rsp)
+	GET_THREAD_INFO(%r10)
+	cli
+	testl	$_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
+	jnz	int_ret_from_sys_call
+	/* clear IF, that popfq doesn't enable interrupts early */
+	andl  $~0x200,EFLAGS-R11(%rsp) 
+	RESTORE_ARGS 1,24,1,1,1,1
+	popfq
+	popq	%rcx				/* User %esp */
+	movl	$VSYSCALL32_SYSEXIT,%edx	/* User %eip */
+	swapgs
+	sti		/* sti only takes effect after the next instruction */
+	/* sysexit */
+	.byte	0xf, 0x35
+
+sysenter_tracesys:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	$-ENOSYS,RAX(%rsp)	/* really needed? */
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call	syscall_trace_enter
+	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	movl	%ebp, %ebp
+	/* no need to do an access_ok check here because rbp has been
+	   32bit zero extended */ 
+1:	movl	(%rbp),%r9d
+	.section __ex_table,"a"
+	.quad 1b,ia32_badarg
+	.previous
+	jmp	sysenter_do_call
+	CFI_ENDPROC
+
+/*
+ * 32bit SYSCALL instruction entry.
+ *
+ * Arguments:
+ * %eax	System call number.
+ * %ebx Arg1
+ * %ecx return EIP 
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
+ * %esp user stack 
+ * 0(%esp) Arg6
+ * 	
+ * Interrupts off.
+ *	
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below.	Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.	
+ */ 	
+ENTRY(ia32_cstar_target)
+	CFI_STARTPROC
+	swapgs
+	movl	%esp,%r8d
+	movq	%gs:pda_kernelstack,%rsp
+	sti
+	SAVE_ARGS 8,1,1
+	movl 	%eax,%eax	/* zero extension */
+	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
+	movq	%rcx,RIP-ARGOFFSET(%rsp)
+	movq	%rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+	movl	%ebp,%ecx
+	movq	$__USER32_CS,CS-ARGOFFSET(%rsp)
+	movq	$__USER32_DS,SS-ARGOFFSET(%rsp)
+	movq	%r11,EFLAGS-ARGOFFSET(%rsp)
+	movq	%r8,RSP-ARGOFFSET(%rsp)	
+	/* no need to do an access_ok check here because r8 has been
+	   32bit zero extended */ 
+	/* hardware stack frame is complete now */	
+1:	movl	(%r8),%r9d
+	.section __ex_table,"a"
+	.quad 1b,ia32_badarg
+	.previous	
+	GET_THREAD_INFO(%r10)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+	jnz   cstar_tracesys
+cstar_do_call:	
+	cmpl $IA32_NR_syscalls,%eax
+	jae  ia32_badsys
+	IA32_ARG_FIXUP 1
+	call *ia32_sys_call_table(,%rax,8)
+	movq %rax,RAX-ARGOFFSET(%rsp)
+	GET_THREAD_INFO(%r10)
+	cli
+	testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
+	jnz  int_ret_from_sys_call
+	RESTORE_ARGS 1,-ARG_SKIP,1,1,1
+	movl RIP-ARGOFFSET(%rsp),%ecx
+	movl EFLAGS-ARGOFFSET(%rsp),%r11d	
+	movl RSP-ARGOFFSET(%rsp),%esp
+	swapgs
+	sysretl
+	
+cstar_tracesys:	
+	SAVE_REST
+	CLEAR_RREGS
+	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
+	movq %rsp,%rdi        /* &pt_regs -> arg1 */
+	call syscall_trace_enter
+	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	movl RSP-ARGOFFSET(%rsp), %r8d
+	/* no need to do an access_ok check here because r8 has been
+	   32bit zero extended */ 
+1:	movl	(%r8),%r9d
+	.section __ex_table,"a"
+	.quad 1b,ia32_badarg
+	.previous
+	jmp cstar_do_call
+				
+ia32_badarg:
+	movq $-EFAULT,%rax
+	jmp ia32_sysret
+	CFI_ENDPROC
+
+/* 
+ * Emulated IA32 system calls via int 0x80. 
+ *
+ * Arguments:	 
+ * %eax	System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
+ *
+ * Notes:
+ * Uses the same stack frame as the x86-64 version.	
+ * All registers except %eax must be saved (but ptrace may violate that)
+ * Arguments are zero extended. For system calls that want sign extension and
+ * take long arguments a wrapper is needed. Most calls can just be called
+ * directly.
+ * Assumes it is only called from user space and entered with interrupts off.	
+ */ 				
+
+ENTRY(ia32_syscall)
+	CFI_STARTPROC
+	swapgs	
+	sti
+	movl %eax,%eax
+	pushq %rax
+	cld
+	/* note the registers are not zero extended to the sf.
+	   this could be a problem. */
+	SAVE_ARGS 0,0,1
+	GET_THREAD_INFO(%r10)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+	jnz ia32_tracesys
+ia32_do_syscall:	
+	cmpl $(IA32_NR_syscalls),%eax
+	jae  ia32_badsys
+	IA32_ARG_FIXUP
+	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
+ia32_sysret:
+	movq %rax,RAX-ARGOFFSET(%rsp)
+	jmp int_ret_from_sys_call 
+
+ia32_tracesys:			 
+	SAVE_REST
+	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
+	movq %rsp,%rdi        /* &pt_regs -> arg1 */
+	call syscall_trace_enter
+	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp ia32_do_syscall
+
+ia32_badsys:
+	movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+	jmp int_ret_from_sys_call
+
+ni_syscall:
+	movq %rax,%rdi
+	jmp  sys32_ni_syscall			
+
+quiet_ni_syscall:
+	movq $-ENOSYS,%rax
+	ret
+	CFI_ENDPROC
+	
+	.macro PTREGSCALL label, func, arg
+	.globl \label
+\label:
+	leaq \func(%rip),%rax
+	leaq -ARGOFFSET+8(%rsp),\arg	/* 8 for return address */
+	jmp  ia32_ptregs_common	
+	.endm
+
+	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
+	PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
+	PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
+	PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
+	PTREGSCALL stub32_execve, sys32_execve, %rcx
+	PTREGSCALL stub32_fork, sys_fork, %rdi
+	PTREGSCALL stub32_clone, sys32_clone, %rdx
+	PTREGSCALL stub32_vfork, sys_vfork, %rdi
+	PTREGSCALL stub32_iopl, sys_iopl, %rsi
+	PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
+
+ENTRY(ia32_ptregs_common)
+	CFI_STARTPROC
+	popq %r11
+	SAVE_REST
+	call *%rax
+	RESTORE_REST
+	jmp  ia32_sysret	/* misbalances the return cache */
+	CFI_ENDPROC
+
+	.data
+	.align 8
+	.globl ia32_sys_call_table
+ia32_sys_call_table:
+	.quad sys_restart_syscall
+	.quad sys_exit
+	.quad stub32_fork
+	.quad sys_read
+	.quad sys_write
+	.quad sys32_open		/* 5 */
+	.quad sys_close
+	.quad sys32_waitpid
+	.quad sys_creat
+	.quad sys_link
+	.quad sys_unlink		/* 10 */
+	.quad stub32_execve
+	.quad sys_chdir
+	.quad compat_sys_time
+	.quad sys_mknod
+	.quad sys_chmod		/* 15 */
+	.quad sys_lchown16
+	.quad quiet_ni_syscall			/* old break syscall holder */
+	.quad sys_stat
+	.quad sys32_lseek
+	.quad sys_getpid		/* 20 */
+	.quad compat_sys_mount	/* mount  */
+	.quad sys_oldumount	/* old_umount  */
+	.quad sys_setuid16
+	.quad sys_getuid16
+	.quad compat_sys_stime	/* stime */		/* 25 */
+	.quad sys32_ptrace	/* ptrace */
+	.quad sys_alarm
+	.quad sys_fstat	/* (old)fstat */
+	.quad sys_pause
+	.quad compat_sys_utime	/* 30 */
+	.quad quiet_ni_syscall	/* old stty syscall holder */
+	.quad quiet_ni_syscall	/* old gtty syscall holder */
+	.quad sys_access
+	.quad sys_nice	
+	.quad quiet_ni_syscall	/* 35 */	/* old ftime syscall holder */
+	.quad sys_sync
+	.quad sys32_kill
+	.quad sys_rename
+	.quad sys_mkdir
+	.quad sys_rmdir		/* 40 */
+	.quad sys_dup
+	.quad sys32_pipe
+	.quad compat_sys_times
+	.quad quiet_ni_syscall			/* old prof syscall holder */
+	.quad sys_brk		/* 45 */
+	.quad sys_setgid16
+	.quad sys_getgid16
+	.quad sys_signal
+	.quad sys_geteuid16
+	.quad sys_getegid16	/* 50 */
+	.quad sys_acct
+	.quad sys_umount			/* new_umount */
+	.quad quiet_ni_syscall			/* old lock syscall holder */
+	.quad compat_sys_ioctl
+	.quad compat_sys_fcntl64		/* 55 */
+	.quad quiet_ni_syscall			/* old mpx syscall holder */
+	.quad sys_setpgid
+	.quad quiet_ni_syscall			/* old ulimit syscall holder */
+	.quad sys32_olduname
+	.quad sys_umask		/* 60 */
+	.quad sys_chroot
+	.quad sys32_ustat
+	.quad sys_dup2
+	.quad sys_getppid
+	.quad sys_getpgrp		/* 65 */
+	.quad sys_setsid
+	.quad sys32_sigaction
+	.quad sys_sgetmask
+	.quad sys_ssetmask
+	.quad sys_setreuid16	/* 70 */
+	.quad sys_setregid16
+	.quad stub32_sigsuspend
+	.quad compat_sys_sigpending
+	.quad sys_sethostname
+	.quad compat_sys_setrlimit	/* 75 */
+	.quad compat_sys_old_getrlimit	/* old_getrlimit */
+	.quad compat_sys_getrusage
+	.quad sys32_gettimeofday
+	.quad sys32_settimeofday
+	.quad sys_getgroups16	/* 80 */
+	.quad sys_setgroups16
+	.quad sys32_old_select
+	.quad sys_symlink
+	.quad sys_lstat
+	.quad sys_readlink		/* 85 */
+#ifdef CONFIG_IA32_AOUT
+	.quad sys_uselib
+#else
+	.quad quiet_ni_syscall
+#endif
+	.quad sys_swapon
+	.quad sys_reboot
+	.quad compat_sys_old_readdir
+	.quad sys32_mmap		/* 90 */
+	.quad sys_munmap
+	.quad sys_truncate
+	.quad sys_ftruncate
+	.quad sys_fchmod
+	.quad sys_fchown16		/* 95 */
+	.quad sys_getpriority
+	.quad sys_setpriority
+	.quad quiet_ni_syscall			/* old profil syscall holder */
+	.quad compat_sys_statfs
+	.quad compat_sys_fstatfs		/* 100 */
+	.quad sys_ioperm
+	.quad compat_sys_socketcall
+	.quad sys_syslog
+	.quad compat_sys_setitimer
+	.quad compat_sys_getitimer	/* 105 */
+	.quad compat_sys_newstat
+	.quad compat_sys_newlstat
+	.quad compat_sys_newfstat
+	.quad sys32_uname
+	.quad stub32_iopl		/* 110 */
+	.quad sys_vhangup
+	.quad quiet_ni_syscall	/* old "idle" system call */
+	.quad sys32_vm86_warning	/* vm86old */ 
+	.quad compat_sys_wait4
+	.quad sys_swapoff		/* 115 */
+	.quad sys32_sysinfo
+	.quad sys32_ipc
+	.quad sys_fsync
+	.quad stub32_sigreturn
+	.quad stub32_clone		/* 120 */
+	.quad sys_setdomainname
+	.quad sys_uname
+	.quad sys_modify_ldt
+	.quad sys32_adjtimex
+	.quad sys32_mprotect		/* 125 */
+	.quad compat_sys_sigprocmask
+	.quad quiet_ni_syscall		/* create_module */
+	.quad sys_init_module
+	.quad sys_delete_module
+	.quad quiet_ni_syscall		/* 130  get_kernel_syms */
+	.quad sys_quotactl
+	.quad sys_getpgid
+	.quad sys_fchdir
+	.quad quiet_ni_syscall	/* bdflush */
+	.quad sys_sysfs		/* 135 */
+	.quad sys_personality
+	.quad quiet_ni_syscall	/* for afs_syscall */
+	.quad sys_setfsuid16
+	.quad sys_setfsgid16
+	.quad sys_llseek		/* 140 */
+	.quad compat_sys_getdents
+	.quad compat_sys_select
+	.quad sys_flock
+	.quad sys_msync
+	.quad compat_sys_readv		/* 145 */
+	.quad compat_sys_writev
+	.quad sys_getsid
+	.quad sys_fdatasync
+	.quad sys32_sysctl	/* sysctl */
+	.quad sys_mlock		/* 150 */
+	.quad sys_munlock
+	.quad sys_mlockall
+	.quad sys_munlockall
+	.quad sys_sched_setparam
+	.quad sys_sched_getparam   /* 155 */
+	.quad sys_sched_setscheduler
+	.quad sys_sched_getscheduler
+	.quad sys_sched_yield
+	.quad sys_sched_get_priority_max
+	.quad sys_sched_get_priority_min  /* 160 */
+	.quad sys_sched_rr_get_interval
+	.quad compat_sys_nanosleep
+	.quad sys_mremap
+	.quad sys_setresuid16
+	.quad sys_getresuid16	/* 165 */
+	.quad sys32_vm86_warning	/* vm86 */ 
+	.quad quiet_ni_syscall	/* query_module */
+	.quad sys_poll
+	.quad compat_sys_nfsservctl
+	.quad sys_setresgid16	/* 170 */
+	.quad sys_getresgid16
+	.quad sys_prctl
+	.quad stub32_rt_sigreturn
+	.quad sys32_rt_sigaction
+	.quad sys32_rt_sigprocmask	/* 175 */
+	.quad sys32_rt_sigpending
+	.quad compat_sys_rt_sigtimedwait
+	.quad sys32_rt_sigqueueinfo
+	.quad stub32_rt_sigsuspend
+	.quad sys32_pread		/* 180 */
+	.quad sys32_pwrite
+	.quad sys_chown16
+	.quad sys_getcwd
+	.quad sys_capget
+	.quad sys_capset
+	.quad stub32_sigaltstack
+	.quad sys32_sendfile
+	.quad quiet_ni_syscall		/* streams1 */
+	.quad quiet_ni_syscall		/* streams2 */
+	.quad stub32_vfork            /* 190 */
+	.quad compat_sys_getrlimit
+	.quad sys32_mmap2
+	.quad sys32_truncate64
+	.quad sys32_ftruncate64
+	.quad sys32_stat64		/* 195 */
+	.quad sys32_lstat64
+	.quad sys32_fstat64
+	.quad sys_lchown
+	.quad sys_getuid
+	.quad sys_getgid		/* 200 */
+	.quad sys_geteuid
+	.quad sys_getegid
+	.quad sys_setreuid
+	.quad sys_setregid
+	.quad sys_getgroups	/* 205 */
+	.quad sys_setgroups
+	.quad sys_fchown
+	.quad sys_setresuid
+	.quad sys_getresuid
+	.quad sys_setresgid	/* 210 */
+	.quad sys_getresgid
+	.quad sys_chown
+	.quad sys_setuid
+	.quad sys_setgid
+	.quad sys_setfsuid		/* 215 */
+	.quad sys_setfsgid
+	.quad sys_pivot_root
+	.quad sys_mincore
+	.quad sys_madvise
+	.quad compat_sys_getdents64	/* 220 getdents64 */
+	.quad compat_sys_fcntl64	
+	.quad quiet_ni_syscall		/* tux */
+	.quad quiet_ni_syscall    	/* security */
+	.quad sys_gettid	
+	.quad sys_readahead	/* 225 */
+	.quad sys_setxattr
+	.quad sys_lsetxattr
+	.quad sys_fsetxattr
+	.quad sys_getxattr
+	.quad sys_lgetxattr	/* 230 */
+	.quad sys_fgetxattr
+	.quad sys_listxattr
+	.quad sys_llistxattr
+	.quad sys_flistxattr
+	.quad sys_removexattr	/* 235 */
+	.quad sys_lremovexattr
+	.quad sys_fremovexattr
+	.quad sys_tkill
+	.quad sys_sendfile64 
+	.quad compat_sys_futex		/* 240 */
+	.quad compat_sys_sched_setaffinity
+	.quad compat_sys_sched_getaffinity
+	.quad sys32_set_thread_area
+	.quad sys32_get_thread_area
+	.quad compat_sys_io_setup	/* 245 */
+	.quad sys_io_destroy
+	.quad compat_sys_io_getevents
+	.quad compat_sys_io_submit
+	.quad sys_io_cancel
+	.quad sys_fadvise64		/* 250 */
+	.quad quiet_ni_syscall 	/* free_huge_pages */
+	.quad sys_exit_group
+	.quad sys32_lookup_dcookie
+	.quad sys_epoll_create
+	.quad sys_epoll_ctl		/* 255 */
+	.quad sys_epoll_wait
+	.quad sys_remap_file_pages
+	.quad sys_set_tid_address
+	.quad sys32_timer_create
+	.quad compat_sys_timer_settime	/* 260 */
+	.quad compat_sys_timer_gettime
+	.quad sys_timer_getoverrun
+	.quad sys_timer_delete
+	.quad compat_sys_clock_settime
+	.quad compat_sys_clock_gettime	/* 265 */
+	.quad compat_sys_clock_getres
+	.quad compat_sys_clock_nanosleep
+	.quad compat_sys_statfs64
+	.quad compat_sys_fstatfs64
+	.quad sys_tgkill		/* 270 */
+	.quad compat_sys_utimes
+	.quad sys32_fadvise64_64
+	.quad quiet_ni_syscall	/* sys_vserver */
+	.quad sys_mbind
+	.quad compat_sys_get_mempolicy	/* 275 */
+	.quad sys_set_mempolicy
+	.quad compat_sys_mq_open
+	.quad sys_mq_unlink
+	.quad compat_sys_mq_timedsend
+	.quad compat_sys_mq_timedreceive	/* 280 */
+	.quad compat_sys_mq_notify
+	.quad compat_sys_mq_getsetattr
+	.quad quiet_ni_syscall		/* reserved for kexec */
+	.quad compat_sys_waitid
+	.quad quiet_ni_syscall		/* sys_altroot */
+	.quad sys_add_key
+	.quad sys_request_key
+	.quad sys_keyctl
+	/* don't forget to change IA32_NR_syscalls */
+ia32_syscall_end:		
+	.rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
+		.quad ni_syscall
+	.endr
diff --git a/arch/x86_64/ia32/ipc32.c b/arch/x86_64/ia32/ipc32.c
new file mode 100644
index 000000000000..369151dc3213
--- /dev/null
+++ b/arch/x86_64/ia32/ipc32.c
@@ -0,0 +1,57 @@
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/syscalls.h>
+#include <linux/time.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/ipc.h>
+#include <linux/compat.h>
+
+#include <asm-i386/ipc.h>
+
+asmlinkage long
+sys32_ipc(u32 call, int first, int second, int third,
+		compat_uptr_t ptr, u32 fifth)
+{
+	int version;
+
+	version = call >> 16; /* hack for backward compatibility */
+	call &= 0xffff;
+
+	switch (call) {
+	      case SEMOP:
+		/* struct sembuf is the same on 32 and 64bit :)) */
+		return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
+	      case SEMTIMEDOP:
+		return compat_sys_semtimedop(first, compat_ptr(ptr), second,
+						compat_ptr(fifth));
+	      case SEMGET:
+		return sys_semget(first, second, third);
+	      case SEMCTL:
+		return compat_sys_semctl(first, second, third, compat_ptr(ptr));
+
+	      case MSGSND:
+		return compat_sys_msgsnd(first, second, third, compat_ptr(ptr));
+	      case MSGRCV:
+		return compat_sys_msgrcv(first, second, fifth, third,
+					 version, compat_ptr(ptr));
+	      case MSGGET:
+		return sys_msgget((key_t) first, second);
+	      case MSGCTL:
+		return compat_sys_msgctl(first, second, compat_ptr(ptr));
+
+	      case SHMAT:
+		return compat_sys_shmat(first, second, third, version,
+					compat_ptr(ptr));
+		break;
+	      case SHMDT:
+		return sys_shmdt(compat_ptr(ptr));
+	      case SHMGET:
+		return sys_shmget(first, (unsigned)second, third);
+	      case SHMCTL:
+		return compat_sys_shmctl(first, second, compat_ptr(ptr));
+	}
+	return -ENOSYS;
+}
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c
new file mode 100644
index 000000000000..b98b6d2462f6
--- /dev/null
+++ b/arch/x86_64/ia32/ptrace32.c
@@ -0,0 +1,379 @@
+/* 
+ * 32bit ptrace for x86-64.
+ *
+ * Copyright 2001,2002 Andi Kleen, SuSE Labs.
+ * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier 
+ * copyright.
+ * 
+ * This allows to access 64bit processes too; but there is no way to see the extended 
+ * register contents.
+ *
+ * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $
+ */ 
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/mm.h>
+#include <linux/ptrace.h>
+#include <asm/ptrace.h>
+#include <asm/compat.h>
+#include <asm/uaccess.h>
+#include <asm/user32.h>
+#include <asm/user.h>
+#include <asm/errno.h>
+#include <asm/debugreg.h>
+#include <asm/i387.h>
+#include <asm/fpu32.h>
+
+/* determines which flags the user has access to. */
+/* 1 = access 0 = no access */
+#define FLAG_MASK 0x44dd5UL
+
+#define R32(l,q) \
+	case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break
+
+static int putreg32(struct task_struct *child, unsigned regno, u32 val)
+{
+	int i;
+	__u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs)); 
+
+	switch (regno) {
+	case offsetof(struct user32, regs.fs):
+		if (val && (val & 3) != 3) return -EIO; 
+		child->thread.fs = val & 0xffff; 
+		break;
+	case offsetof(struct user32, regs.gs):
+		if (val && (val & 3) != 3) return -EIO; 
+		child->thread.gs = val & 0xffff;
+		break;
+	case offsetof(struct user32, regs.ds):
+		if (val && (val & 3) != 3) return -EIO; 
+		child->thread.ds = val & 0xffff;
+		break;
+	case offsetof(struct user32, regs.es):
+		child->thread.es = val & 0xffff;
+		break;
+	case offsetof(struct user32, regs.ss): 
+		if ((val & 3) != 3) return -EIO;
+        	stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff;
+		break;
+	case offsetof(struct user32, regs.cs): 
+		if ((val & 3) != 3) return -EIO;
+		stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff;
+		break;
+
+	R32(ebx, rbx); 
+	R32(ecx, rcx);
+	R32(edx, rdx);
+	R32(edi, rdi);
+	R32(esi, rsi);
+	R32(ebp, rbp);
+	R32(eax, rax);
+	R32(orig_eax, orig_rax);
+	R32(eip, rip);
+	R32(esp, rsp);
+
+	case offsetof(struct user32, regs.eflags): {
+		__u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8];
+		val &= FLAG_MASK;
+		*flags = val | (*flags & ~FLAG_MASK);
+		break;
+	}
+
+	case offsetof(struct user32, u_debugreg[4]): 
+	case offsetof(struct user32, u_debugreg[5]):
+		return -EIO;
+
+	case offsetof(struct user32, u_debugreg[0]):
+		child->thread.debugreg0 = val;
+		break;
+
+	case offsetof(struct user32, u_debugreg[1]):
+		child->thread.debugreg1 = val;
+		break;
+
+	case offsetof(struct user32, u_debugreg[2]):
+		child->thread.debugreg2 = val;
+		break;
+
+	case offsetof(struct user32, u_debugreg[3]):
+		child->thread.debugreg3 = val;
+		break;
+
+	case offsetof(struct user32, u_debugreg[6]):
+		child->thread.debugreg6 = val;
+		break; 
+
+	case offsetof(struct user32, u_debugreg[7]):
+		val &= ~DR_CONTROL_RESERVED;
+		/* See arch/i386/kernel/ptrace.c for an explanation of
+		 * this awkward check.*/
+		for(i=0; i<4; i++)
+			if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
+			       return -EIO;
+		child->thread.debugreg7 = val; 
+		break; 
+		    
+	default:
+		if (regno > sizeof(struct user32) || (regno & 3))
+			return -EIO;
+	       
+		/* Other dummy fields in the virtual user structure are ignored */ 
+		break; 		
+	}
+	return 0;
+}
+
+#undef R32
+
+#define R32(l,q) \
+	case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break
+
+static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
+{
+	__u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs)); 
+
+	switch (regno) {
+	case offsetof(struct user32, regs.fs):
+	        *val = child->thread.fs; 
+		break;
+	case offsetof(struct user32, regs.gs):
+		*val = child->thread.gs;
+		break;
+	case offsetof(struct user32, regs.ds):
+		*val = child->thread.ds;
+		break;
+	case offsetof(struct user32, regs.es):
+		*val = child->thread.es;
+		break;
+
+	R32(cs, cs);
+	R32(ss, ss);
+	R32(ebx, rbx); 
+	R32(ecx, rcx);
+	R32(edx, rdx);
+	R32(edi, rdi);
+	R32(esi, rsi);
+	R32(ebp, rbp);
+	R32(eax, rax);
+	R32(orig_eax, orig_rax);
+	R32(eip, rip);
+	R32(eflags, eflags);
+	R32(esp, rsp);
+
+	case offsetof(struct user32, u_debugreg[0]): 
+		*val = child->thread.debugreg0; 
+		break; 
+	case offsetof(struct user32, u_debugreg[1]): 
+		*val = child->thread.debugreg1; 
+		break; 
+	case offsetof(struct user32, u_debugreg[2]): 
+		*val = child->thread.debugreg2; 
+		break; 
+	case offsetof(struct user32, u_debugreg[3]): 
+		*val = child->thread.debugreg3; 
+		break; 
+	case offsetof(struct user32, u_debugreg[6]): 
+		*val = child->thread.debugreg6; 
+		break; 
+	case offsetof(struct user32, u_debugreg[7]): 
+		*val = child->thread.debugreg7; 
+		break; 
+		    
+	default:
+		if (regno > sizeof(struct user32) || (regno & 3))
+			return -EIO;
+
+		/* Other dummy fields in the virtual user structure are ignored */ 
+		*val = 0;
+		break; 		
+	}
+	return 0;
+}
+
+#undef R32
+
+static struct task_struct *find_target(int request, int pid, int *err)
+{ 
+	struct task_struct *child;
+
+	*err = -EPERM; 
+	if (pid == 1)
+		return NULL; 
+
+	*err = -ESRCH;
+	read_lock(&tasklist_lock);
+	child = find_task_by_pid(pid);
+	if (child)
+		get_task_struct(child);
+	read_unlock(&tasklist_lock);
+	if (child) { 
+		*err = -EPERM;
+		if (child->pid == 1) 
+			goto out;
+		*err = ptrace_check_attach(child, request == PTRACE_KILL); 
+		if (*err < 0) 
+			goto out;
+		return child; 
+	} 
+ out:
+	if (child)
+	put_task_struct(child);
+	return NULL; 
+	
+} 
+
+asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
+{
+	struct task_struct *child;
+	struct pt_regs *childregs; 
+	void __user *datap = compat_ptr(data);
+	int ret;
+	__u32 val;
+
+	switch (request) { 
+	default:
+		return sys_ptrace(request, pid, addr, data); 
+
+	case PTRACE_PEEKTEXT:
+	case PTRACE_PEEKDATA:
+	case PTRACE_POKEDATA:
+	case PTRACE_POKETEXT:
+	case PTRACE_POKEUSR:       
+	case PTRACE_PEEKUSR:
+	case PTRACE_GETREGS:
+	case PTRACE_SETREGS:
+	case PTRACE_SETFPREGS:
+	case PTRACE_GETFPREGS:
+	case PTRACE_SETFPXREGS:
+	case PTRACE_GETFPXREGS:
+	case PTRACE_GETEVENTMSG:
+		break;
+	} 
+
+	child = find_target(request, pid, &ret);
+	if (!child)
+		return ret;
+
+	childregs = (struct pt_regs *)(child->thread.rsp0 - sizeof(struct pt_regs)); 
+
+	switch (request) {
+	case PTRACE_PEEKDATA:
+	case PTRACE_PEEKTEXT:
+		ret = 0;
+		if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32))
+			ret = -EIO;
+		else
+			ret = put_user(val, (unsigned int __user *)datap); 
+		break; 
+
+	case PTRACE_POKEDATA:
+	case PTRACE_POKETEXT:
+		ret = 0;
+		if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32))
+			ret = -EIO; 
+		break;
+
+	case PTRACE_PEEKUSR:
+		ret = getreg32(child, addr, &val);
+		if (ret == 0)
+			ret = put_user(val, (__u32 __user *)datap);
+		break;
+
+	case PTRACE_POKEUSR:
+		ret = putreg32(child, addr, data);
+		break;
+
+	case PTRACE_GETREGS: { /* Get all gp regs from the child. */
+		int i;
+	  	if (!access_ok(VERIFY_WRITE, datap, 16*4)) {
+			ret = -EIO;
+			break;
+		}
+		ret = 0;
+		for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) {
+			getreg32(child, i, &val);
+			ret |= __put_user(val,(u32 __user *)datap);
+			datap += sizeof(u32);
+		}
+		break;
+	}
+
+	case PTRACE_SETREGS: { /* Set all gp regs in the child. */
+		unsigned long tmp;
+		int i;
+	  	if (!access_ok(VERIFY_READ, datap, 16*4)) {
+			ret = -EIO;
+			break;
+		}
+		ret = 0; 
+		for ( i = 0; i <= 16*4; i += sizeof(u32) ) {
+			ret |= __get_user(tmp, (u32 __user *)datap);
+			putreg32(child, i, tmp);
+			datap += sizeof(u32);
+		}
+		break;
+	}
+
+	case PTRACE_GETFPREGS:
+		ret = -EIO; 
+		if (!access_ok(VERIFY_READ, compat_ptr(data), 
+			       sizeof(struct user_i387_struct)))
+			break;
+		save_i387_ia32(child, datap, childregs, 1);
+		ret = 0; 
+			break;
+
+	case PTRACE_SETFPREGS:
+		ret = -EIO;
+		if (!access_ok(VERIFY_WRITE, datap, 
+			       sizeof(struct user_i387_struct)))
+			break;
+		ret = 0;
+		/* don't check EFAULT to be bug-to-bug compatible to i386 */
+		restore_i387_ia32(child, datap, 1);
+		break;
+
+	case PTRACE_GETFPXREGS: { 
+		struct user32_fxsr_struct __user *u = datap;
+		init_fpu(child); 
+		ret = -EIO;
+		if (!access_ok(VERIFY_WRITE, u, sizeof(*u)))
+			break;
+			ret = -EFAULT;
+		if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u)))
+			break;
+		ret = __put_user(childregs->cs, &u->fcs);
+		ret |= __put_user(child->thread.ds, &u->fos); 
+		break; 
+	} 
+	case PTRACE_SETFPXREGS: { 
+		struct user32_fxsr_struct __user *u = datap;
+		unlazy_fpu(child);
+		ret = -EIO;
+		if (!access_ok(VERIFY_READ, u, sizeof(*u)))
+			break;
+		/* no checking to be bug-to-bug compatible with i386 */
+		__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u));
+		set_stopped_child_used_math(child);
+		child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+		ret = 0; 
+		break;
+	}
+
+	case PTRACE_GETEVENTMSG:
+		ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data));
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	put_task_struct(child);
+	return ret;
+}
+
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
new file mode 100644
index 000000000000..68a9ab06ee7c
--- /dev/null
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -0,0 +1,1050 @@
+/*
+ * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
+ *             sys_sparc32 
+ *
+ * Copyright (C) 2000		VA Linux Co
+ * Copyright (C) 2000		Don Dugger <n0ano@valinux.com>
+ * Copyright (C) 1999 		Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 1997,1998 	Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 		David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 2000		Hewlett-Packard Co.
+ * Copyright (C) 2000		David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2000,2001,2002	Andi Kleen, SuSE Labs (x86-64 port) 
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * environment. In 2.5 most of this should be moved to a generic directory. 
+ *
+ * This file assumes that there is a hole at the end of user address space.
+ * 
+ * Some of the functions are LE specific currently. These are hopefully all marked.
+ * This should be fixed.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h> 
+#include <linux/file.h> 
+#include <linux/signal.h>
+#include <linux/syscalls.h>
+#include <linux/resource.h>
+#include <linux/times.h>
+#include <linux/utsname.h>
+#include <linux/timex.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/nfs_fs.h>
+#include <linux/quota.h>
+#include <linux/module.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/nfsd/nfsd.h>
+#include <linux/nfsd/cache.h>
+#include <linux/nfsd/xdr.h>
+#include <linux/nfsd/syscall.h>
+#include <linux/poll.h>
+#include <linux/personality.h>
+#include <linux/stat.h>
+#include <linux/ipc.h>
+#include <linux/rwsem.h>
+#include <linux/binfmts.h>
+#include <linux/init.h>
+#include <linux/aio_abi.h>
+#include <linux/aio.h>
+#include <linux/compat.h>
+#include <linux/vfs.h>
+#include <linux/ptrace.h>
+#include <linux/highuid.h>
+#include <linux/vmalloc.h>
+#include <asm/mman.h>
+#include <asm/types.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <asm/atomic.h>
+#include <asm/ldt.h>
+
+#include <net/scm.h>
+#include <net/sock.h>
+#include <asm/ia32.h>
+
+#define AA(__x)		((unsigned long)(__x))
+
+int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
+{
+	typeof(ubuf->st_uid) uid = 0;
+	typeof(ubuf->st_gid) gid = 0;
+	SET_UID(uid, kbuf->uid);
+	SET_GID(gid, kbuf->gid);
+	if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
+		return -EOVERFLOW;
+	if (kbuf->size >= 0x7fffffff)
+		return -EOVERFLOW;
+	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
+	    __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
+	    __put_user (kbuf->ino, &ubuf->st_ino) ||
+	    __put_user (kbuf->mode, &ubuf->st_mode) ||
+	    __put_user (kbuf->nlink, &ubuf->st_nlink) ||
+	    __put_user (uid, &ubuf->st_uid) ||
+	    __put_user (gid, &ubuf->st_gid) ||
+	    __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
+	    __put_user (kbuf->size, &ubuf->st_size) ||
+	    __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) ||
+	    __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
+	    __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
+	    __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
+	    __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
+	    __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
+	    __put_user (kbuf->blksize, &ubuf->st_blksize) ||
+	    __put_user (kbuf->blocks, &ubuf->st_blocks))
+		return -EFAULT;
+	return 0;
+}
+
+asmlinkage long
+sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high)
+{
+       return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
+}
+
+asmlinkage long
+sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high)
+{
+       return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
+}
+
+/* Another set for IA32/LFS -- x86_64 struct stat is different due to 
+   support for 64bit inode numbers. */
+
+static int
+cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
+{
+	typeof(ubuf->st_uid) uid = 0;
+	typeof(ubuf->st_gid) gid = 0;
+	SET_UID(uid, stat->uid);
+	SET_GID(gid, stat->gid);
+	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
+	    __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
+	    __put_user (stat->ino, &ubuf->__st_ino) ||
+	    __put_user (stat->ino, &ubuf->st_ino) ||
+	    __put_user (stat->mode, &ubuf->st_mode) ||
+	    __put_user (stat->nlink, &ubuf->st_nlink) ||
+	    __put_user (uid, &ubuf->st_uid) ||
+	    __put_user (gid, &ubuf->st_gid) ||
+	    __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
+	    __put_user (stat->size, &ubuf->st_size) ||
+	    __put_user (stat->atime.tv_sec, &ubuf->st_atime) ||
+	    __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
+	    __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) ||
+	    __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
+	    __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) ||
+	    __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
+	    __put_user (stat->blksize, &ubuf->st_blksize) ||
+	    __put_user (stat->blocks, &ubuf->st_blocks))
+		return -EFAULT;
+	return 0;
+}
+
+asmlinkage long
+sys32_stat64(char __user * filename, struct stat64 __user *statbuf)
+{
+	struct kstat stat;
+	int ret = vfs_stat(filename, &stat);
+	if (!ret)
+		ret = cp_stat64(statbuf, &stat);
+	return ret;
+}
+
+asmlinkage long
+sys32_lstat64(char __user * filename, struct stat64 __user *statbuf)
+{
+	struct kstat stat;
+	int ret = vfs_lstat(filename, &stat);
+	if (!ret)
+		ret = cp_stat64(statbuf, &stat);
+	return ret;
+}
+
+asmlinkage long
+sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
+{
+	struct kstat stat;
+	int ret = vfs_fstat(fd, &stat);
+	if (!ret)
+		ret = cp_stat64(statbuf, &stat);
+	return ret;
+}
+
+/*
+ * Linux/i386 didn't use to be able to handle more than
+ * 4 system call parameters, so these system calls used a memory
+ * block for parameter passing..
+ */
+
+struct mmap_arg_struct {
+	unsigned int addr;
+	unsigned int len;
+	unsigned int prot;
+	unsigned int flags;
+	unsigned int fd;
+	unsigned int offset;
+};
+
+asmlinkage long
+sys32_mmap(struct mmap_arg_struct __user *arg)
+{
+	struct mmap_arg_struct a;
+	struct file *file = NULL;
+	unsigned long retval;
+	struct mm_struct *mm ;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+
+	if (a.offset & ~PAGE_MASK)
+		return -EINVAL; 
+
+	if (!(a.flags & MAP_ANONYMOUS)) {
+		file = fget(a.fd);
+		if (!file)
+			return -EBADF;
+	}
+	
+	mm = current->mm; 
+	down_write(&mm->mmap_sem); 
+	retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT);
+	if (file)
+		fput(file);
+
+	up_write(&mm->mmap_sem); 
+
+	return retval;
+}
+
+asmlinkage long 
+sys32_mprotect(unsigned long start, size_t len, unsigned long prot)
+{
+	return sys_mprotect(start,len,prot); 
+}
+
+asmlinkage long
+sys32_pipe(int __user *fd)
+{
+	int retval;
+	int fds[2];
+
+	retval = do_pipe(fds);
+	if (retval)
+		goto out;
+	if (copy_to_user(fd, fds, sizeof(fds)))
+		retval = -EFAULT;
+  out:
+	return retval;
+}
+
+asmlinkage long
+sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
+		   struct sigaction32 __user *oact,  unsigned int sigsetsize)
+{
+	struct k_sigaction new_ka, old_ka;
+	int ret;
+	compat_sigset_t set32;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(compat_sigset_t))
+		return -EINVAL;
+
+	if (act) {
+		compat_uptr_t handler, restorer;
+
+		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+		    __get_user(handler, &act->sa_handler) ||
+		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+		    __get_user(restorer, &act->sa_restorer)||
+		    __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)))
+			return -EFAULT;
+		new_ka.sa.sa_handler = compat_ptr(handler);
+		new_ka.sa.sa_restorer = compat_ptr(restorer);
+		/* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */
+		switch (_NSIG_WORDS) {
+		case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
+				| (((long)set32.sig[7]) << 32);
+		case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4]
+				| (((long)set32.sig[5]) << 32);
+		case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2]
+				| (((long)set32.sig[3]) << 32);
+		case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0]
+				| (((long)set32.sig[1]) << 32);
+		}
+	}
+
+	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+	if (!ret && oact) {
+		/* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */
+		switch (_NSIG_WORDS) {
+		case 4:
+			set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
+			set32.sig[6] = old_ka.sa.sa_mask.sig[3];
+		case 3:
+			set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32);
+			set32.sig[4] = old_ka.sa.sa_mask.sig[2];
+		case 2:
+			set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32);
+			set32.sig[2] = old_ka.sa.sa_mask.sig[1];
+		case 1:
+			set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32);
+			set32.sig[0] = old_ka.sa.sa_mask.sig[0];
+		}
+		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+		    __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
+		    __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
+		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+		    __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)))
+			return -EFAULT;
+	}
+
+	return ret;
+}
+
+asmlinkage long
+sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact)
+{
+        struct k_sigaction new_ka, old_ka;
+        int ret;
+
+        if (act) {
+		compat_old_sigset_t mask;
+		compat_uptr_t handler, restorer;
+
+		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+		    __get_user(handler, &act->sa_handler) ||
+		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+		    __get_user(restorer, &act->sa_restorer) ||
+		    __get_user(mask, &act->sa_mask))
+			return -EFAULT;
+
+		new_ka.sa.sa_handler = compat_ptr(handler);
+		new_ka.sa.sa_restorer = compat_ptr(restorer);
+
+		siginitset(&new_ka.sa.sa_mask, mask);
+        }
+
+        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+	if (!ret && oact) {
+		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+		    __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
+		    __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
+		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+		    __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+			return -EFAULT;
+        }
+
+	return ret;
+}
+
+asmlinkage long
+sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
+			compat_sigset_t __user *oset, unsigned int sigsetsize)
+{
+	sigset_t s;
+	compat_sigset_t s32;
+	int ret;
+	mm_segment_t old_fs = get_fs();
+	
+	if (set) {
+		if (copy_from_user (&s32, set, sizeof(compat_sigset_t)))
+			return -EFAULT;
+		switch (_NSIG_WORDS) {
+		case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
+		case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32);
+		case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32);
+		case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
+		}
+	}
+	set_fs (KERNEL_DS);
+	ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL,
+				 sigsetsize); 
+	set_fs (old_fs);
+	if (ret) return ret;
+	if (oset) {
+		switch (_NSIG_WORDS) {
+		case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
+		case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
+		case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
+		case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
+		}
+		if (copy_to_user (oset, &s32, sizeof(compat_sigset_t)))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+static inline long
+get_tv32(struct timeval *o, struct compat_timeval __user *i)
+{
+	int err = -EFAULT; 
+	if (access_ok(VERIFY_READ, i, sizeof(*i))) { 
+		err = __get_user(o->tv_sec, &i->tv_sec);
+		err |= __get_user(o->tv_usec, &i->tv_usec);
+	}
+	return err; 
+}
+
+static inline long
+put_tv32(struct compat_timeval __user *o, struct timeval *i)
+{
+	int err = -EFAULT;
+	if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { 
+		err = __put_user(i->tv_sec, &o->tv_sec);
+		err |= __put_user(i->tv_usec, &o->tv_usec);
+	} 
+	return err; 
+}
+
+extern int do_setitimer(int which, struct itimerval *, struct itimerval *);
+
+asmlinkage long
+sys32_alarm(unsigned int seconds)
+{
+	struct itimerval it_new, it_old;
+	unsigned int oldalarm;
+
+	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+	it_new.it_value.tv_sec = seconds;
+	it_new.it_value.tv_usec = 0;
+	do_setitimer(ITIMER_REAL, &it_new, &it_old);
+	oldalarm = it_old.it_value.tv_sec;
+	/* ehhh.. We can't return 0 if we have an alarm pending.. */
+	/* And we'd better return too much than too little anyway */
+	if (it_old.it_value.tv_usec)
+		oldalarm++;
+	return oldalarm;
+}
+
+/* Translations due to time_t size differences.  Which affects all
+   sorts of things, like timeval and itimerval.  */
+
+extern struct timezone sys_tz;
+
+asmlinkage long
+sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
+{
+	if (tv) {
+		struct timeval ktv;
+		do_gettimeofday(&ktv);
+		if (put_tv32(tv, &ktv))
+			return -EFAULT;
+	}
+	if (tz) {
+		if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+asmlinkage long
+sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
+{
+	struct timeval ktv;
+	struct timespec kts;
+	struct timezone ktz;
+
+ 	if (tv) {
+		if (get_tv32(&ktv, tv))
+			return -EFAULT;
+		kts.tv_sec = ktv.tv_sec;
+		kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
+	}
+	if (tz) {
+		if (copy_from_user(&ktz, tz, sizeof(ktz)))
+			return -EFAULT;
+	}
+
+	return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+}
+
+struct sel_arg_struct {
+	unsigned int n;
+	unsigned int inp;
+	unsigned int outp;
+	unsigned int exp;
+	unsigned int tvp;
+};
+
+asmlinkage long
+sys32_old_select(struct sel_arg_struct __user *arg)
+{
+	struct sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+				 compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
+extern asmlinkage long
+compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options,
+		 struct compat_rusage *ru);
+
+asmlinkage long
+sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
+{
+	return compat_sys_wait4(pid, stat_addr, options, NULL);
+}
+
+int sys32_ni_syscall(int call)
+{ 
+	struct task_struct *me = current;
+	static char lastcomm[sizeof(me->comm)];
+
+	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
+		printk(KERN_INFO "IA32 syscall %d from %s not implemented\n",
+		       call, me->comm);
+		strncpy(lastcomm, me->comm, sizeof(lastcomm));
+	} 
+	return -ENOSYS;	       
+} 
+
+/* 32-bit timeval and related flotsam.  */
+
+asmlinkage long
+sys32_sysfs(int option, u32 arg1, u32 arg2)
+{
+	return sys_sysfs(option, arg1, arg2);
+}
+
+struct sysinfo32 {
+        s32 uptime;
+        u32 loads[3];
+        u32 totalram;
+        u32 freeram;
+        u32 sharedram;
+        u32 bufferram;
+        u32 totalswap;
+        u32 freeswap;
+        unsigned short procs;
+	unsigned short pad; 
+        u32 totalhigh;
+        u32 freehigh;
+        u32 mem_unit;
+        char _f[20-2*sizeof(u32)-sizeof(int)];
+};
+
+asmlinkage long
+sys32_sysinfo(struct sysinfo32 __user *info)
+{
+	struct sysinfo s;
+	int ret;
+	mm_segment_t old_fs = get_fs ();
+	int bitcount = 0;
+	
+	set_fs (KERNEL_DS);
+	ret = sys_sysinfo(&s);
+	set_fs (old_fs);
+
+        /* Check to see if any memory value is too large for 32-bit and scale
+	 *  down if needed
+	 */
+	if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+		while (s.mem_unit < PAGE_SIZE) {
+			s.mem_unit <<= 1;
+			bitcount++;
+		}
+		s.totalram >>= bitcount;
+		s.freeram >>= bitcount;
+		s.sharedram >>= bitcount;
+		s.bufferram >>= bitcount;
+		s.totalswap >>= bitcount;
+		s.freeswap >>= bitcount;
+		s.totalhigh >>= bitcount;
+		s.freehigh >>= bitcount;
+	}
+
+	if (!access_ok(VERIFY_WRITE, info, sizeof(struct sysinfo32)) ||
+	    __put_user (s.uptime, &info->uptime) ||
+	    __put_user (s.loads[0], &info->loads[0]) ||
+	    __put_user (s.loads[1], &info->loads[1]) ||
+	    __put_user (s.loads[2], &info->loads[2]) ||
+	    __put_user (s.totalram, &info->totalram) ||
+	    __put_user (s.freeram, &info->freeram) ||
+	    __put_user (s.sharedram, &info->sharedram) ||
+	    __put_user (s.bufferram, &info->bufferram) ||
+	    __put_user (s.totalswap, &info->totalswap) ||
+	    __put_user (s.freeswap, &info->freeswap) ||
+	    __put_user (s.procs, &info->procs) ||
+	    __put_user (s.totalhigh, &info->totalhigh) || 
+	    __put_user (s.freehigh, &info->freehigh) ||
+	    __put_user (s.mem_unit, &info->mem_unit))
+		return -EFAULT;
+	return 0;
+}
+                
+asmlinkage long
+sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval)
+{
+	struct timespec t;
+	int ret;
+	mm_segment_t old_fs = get_fs ();
+	
+	set_fs (KERNEL_DS);
+	ret = sys_sched_rr_get_interval(pid, &t);
+	set_fs (old_fs);
+	if (put_compat_timespec(&t, interval))
+		return -EFAULT;
+	return ret;
+}
+
+asmlinkage long
+sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
+{
+	sigset_t s;
+	compat_sigset_t s32;
+	int ret;
+	mm_segment_t old_fs = get_fs();
+		
+	set_fs (KERNEL_DS);
+	ret = sys_rt_sigpending(&s, sigsetsize);
+	set_fs (old_fs);
+	if (!ret) {
+		switch (_NSIG_WORDS) {
+		case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
+		case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
+		case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
+		case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
+		}
+		if (copy_to_user (set, &s32, sizeof(compat_sigset_t)))
+			return -EFAULT;
+	}
+	return ret;
+}
+
+asmlinkage long
+sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
+{
+	siginfo_t info;
+	int ret;
+	mm_segment_t old_fs = get_fs();
+	
+	if (copy_siginfo_from_user32(&info, uinfo))
+		return -EFAULT;
+	set_fs (KERNEL_DS);
+	ret = sys_rt_sigqueueinfo(pid, sig, &info);
+	set_fs (old_fs);
+	return ret;
+}
+
+/* These are here just in case some old ia32 binary calls it. */
+asmlinkage long
+sys32_pause(void)
+{
+	current->state = TASK_INTERRUPTIBLE;
+	schedule();
+	return -ERESTARTNOHAND;
+}
+
+
+#ifdef CONFIG_SYSCTL
+struct sysctl_ia32 {
+	unsigned int	name;
+	int		nlen;
+	unsigned int	oldval;
+	unsigned int	oldlenp;
+	unsigned int	newval;
+	unsigned int	newlen;
+	unsigned int	__unused[4];
+};
+
+
+asmlinkage long
+sys32_sysctl(struct sysctl_ia32 __user *args32)
+{
+	struct sysctl_ia32 a32;
+	mm_segment_t old_fs = get_fs ();
+	void __user *oldvalp, *newvalp;
+	size_t oldlen;
+	int __user *namep;
+	long ret;
+	extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
+		     void *newval, size_t newlen);
+
+
+	if (copy_from_user(&a32, args32, sizeof (a32)))
+		return -EFAULT;
+
+	/*
+	 * We need to pre-validate these because we have to disable address checking
+	 * before calling do_sysctl() because of OLDLEN but we can't run the risk of the
+	 * user specifying bad addresses here.  Well, since we're dealing with 32 bit
+	 * addresses, we KNOW that access_ok() will always succeed, so this is an
+	 * expensive NOP, but so what...
+	 */
+	namep = compat_ptr(a32.name);
+	oldvalp = compat_ptr(a32.oldval);
+	newvalp =  compat_ptr(a32.newval);
+
+	if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
+	    || !access_ok(VERIFY_WRITE, namep, 0)
+	    || !access_ok(VERIFY_WRITE, oldvalp, 0)
+	    || !access_ok(VERIFY_WRITE, newvalp, 0))
+		return -EFAULT;
+
+	set_fs(KERNEL_DS);
+	lock_kernel();
+	ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp, (size_t) a32.newlen);
+	unlock_kernel();
+	set_fs(old_fs);
+
+	if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp)))
+		return -EFAULT;
+
+	return ret;
+}
+#endif
+
+/* warning: next two assume little endian */ 
+asmlinkage long
+sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
+{
+	return sys_pread64(fd, ubuf, count,
+			 ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
+
+asmlinkage long
+sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
+{
+	return sys_pwrite64(fd, ubuf, count,
+			  ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
+
+
+asmlinkage long
+sys32_personality(unsigned long personality)
+{
+	int ret;
+	if (personality(current->personality) == PER_LINUX32 && 
+		personality == PER_LINUX)
+		personality = PER_LINUX32;
+	ret = sys_personality(personality);
+	if (ret == PER_LINUX32)
+		ret = PER_LINUX;
+	return ret;
+}
+
+asmlinkage long
+sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
+{
+	mm_segment_t old_fs = get_fs();
+	int ret;
+	off_t of;
+	
+	if (offset && get_user(of, offset))
+		return -EFAULT;
+		
+	set_fs(KERNEL_DS);
+	ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count);
+	set_fs(old_fs);
+	
+	if (!ret && offset && put_user(of, offset))
+		return -EFAULT;
+		
+	return ret;
+}
+
+/* Handle adjtimex compatibility. */
+
+struct timex32 {
+	u32 modes;
+	s32 offset, freq, maxerror, esterror;
+	s32 status, constant, precision, tolerance;
+	struct compat_timeval time;
+	s32 tick;
+	s32 ppsfreq, jitter, shift, stabil;
+	s32 jitcnt, calcnt, errcnt, stbcnt;
+	s32  :32; s32  :32; s32  :32; s32  :32;
+	s32  :32; s32  :32; s32  :32; s32  :32;
+	s32  :32; s32  :32; s32  :32; s32  :32;
+};
+
+extern int do_adjtimex(struct timex *);
+
+asmlinkage long
+sys32_adjtimex(struct timex32 __user *utp)
+{
+	struct timex txc;
+	int ret;
+
+	memset(&txc, 0, sizeof(struct timex));
+
+	if (!access_ok(VERIFY_READ, utp, sizeof(struct timex32)) ||
+	   __get_user(txc.modes, &utp->modes) ||
+	   __get_user(txc.offset, &utp->offset) ||
+	   __get_user(txc.freq, &utp->freq) ||
+	   __get_user(txc.maxerror, &utp->maxerror) ||
+	   __get_user(txc.esterror, &utp->esterror) ||
+	   __get_user(txc.status, &utp->status) ||
+	   __get_user(txc.constant, &utp->constant) ||
+	   __get_user(txc.precision, &utp->precision) ||
+	   __get_user(txc.tolerance, &utp->tolerance) ||
+	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+	   __get_user(txc.tick, &utp->tick) ||
+	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
+	   __get_user(txc.jitter, &utp->jitter) ||
+	   __get_user(txc.shift, &utp->shift) ||
+	   __get_user(txc.stabil, &utp->stabil) ||
+	   __get_user(txc.jitcnt, &utp->jitcnt) ||
+	   __get_user(txc.calcnt, &utp->calcnt) ||
+	   __get_user(txc.errcnt, &utp->errcnt) ||
+	   __get_user(txc.stbcnt, &utp->stbcnt))
+		return -EFAULT;
+
+	ret = do_adjtimex(&txc);
+
+	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct timex32)) ||
+	   __put_user(txc.modes, &utp->modes) ||
+	   __put_user(txc.offset, &utp->offset) ||
+	   __put_user(txc.freq, &utp->freq) ||
+	   __put_user(txc.maxerror, &utp->maxerror) ||
+	   __put_user(txc.esterror, &utp->esterror) ||
+	   __put_user(txc.status, &utp->status) ||
+	   __put_user(txc.constant, &utp->constant) ||
+	   __put_user(txc.precision, &utp->precision) ||
+	   __put_user(txc.tolerance, &utp->tolerance) ||
+	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+	   __put_user(txc.tick, &utp->tick) ||
+	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
+	   __put_user(txc.jitter, &utp->jitter) ||
+	   __put_user(txc.shift, &utp->shift) ||
+	   __put_user(txc.stabil, &utp->stabil) ||
+	   __put_user(txc.jitcnt, &utp->jitcnt) ||
+	   __put_user(txc.calcnt, &utp->calcnt) ||
+	   __put_user(txc.errcnt, &utp->errcnt) ||
+	   __put_user(txc.stbcnt, &utp->stbcnt))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
+	unsigned long prot, unsigned long flags,
+	unsigned long fd, unsigned long pgoff)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long error;
+	struct file * file = NULL;
+
+	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+	if (!(flags & MAP_ANONYMOUS)) {
+		file = fget(fd);
+		if (!file)
+			return -EBADF;
+	}
+
+	down_write(&mm->mmap_sem);
+	error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+	up_write(&mm->mmap_sem);
+
+	if (file)
+		fput(file);
+	return error;
+}
+
+asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
+{
+	int error;
+
+	if (!name)
+		return -EFAULT;
+	if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+		return -EFAULT;
+  
+  	down_read(&uts_sem);
+	
+	error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
+	 __put_user(0,name->sysname+__OLD_UTS_LEN);
+	 __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
+	 __put_user(0,name->nodename+__OLD_UTS_LEN);
+	 __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
+	 __put_user(0,name->release+__OLD_UTS_LEN);
+	 __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
+	 __put_user(0,name->version+__OLD_UTS_LEN);
+	 { 
+		 char *arch = "x86_64";
+		 if (personality(current->personality) == PER_LINUX32)
+			 arch = "i686";
+		 
+		 __copy_to_user(&name->machine,arch,strlen(arch)+1);
+	 }
+	
+	 up_read(&uts_sem);
+	 
+	 error = error ? -EFAULT : 0;
+	 
+	 return error;
+}
+
+long sys32_uname(struct old_utsname __user * name)
+{
+	int err;
+	if (!name)
+		return -EFAULT;
+	down_read(&uts_sem);
+	err=copy_to_user(name, &system_utsname, sizeof (*name));
+	up_read(&uts_sem);
+	if (personality(current->personality) == PER_LINUX32) 
+		err |= copy_to_user(&name->machine, "i686", 5);
+	return err?-EFAULT:0;
+}
+
+long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
+{
+	struct ustat u;
+	mm_segment_t seg;
+	int ret;
+	
+	seg = get_fs(); 
+	set_fs(KERNEL_DS); 
+	ret = sys_ustat(dev,&u); 
+	set_fs(seg);
+	if (ret >= 0) { 
+		if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || 
+		    __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
+		    __put_user((__u32) u.f_tinode, &u32p->f_tfree) ||
+		    __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
+		    __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
+			ret = -EFAULT;
+	}
+	return ret;
+} 
+
+asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+			     compat_uptr_t __user *envp, struct pt_regs *regs)
+{
+	long error;
+	char * filename;
+
+	filename = getname(name);
+	error = PTR_ERR(filename);
+	if (IS_ERR(filename))
+		return error;
+	error = compat_do_execve(filename, argv, envp, regs);
+	if (error == 0) {
+		task_lock(current);
+		current->ptrace &= ~PT_DTRACE;
+		task_unlock(current);
+	}
+	putname(filename);
+	return error;
+}
+
+asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
+			    struct pt_regs *regs)
+{
+	void __user *parent_tid = (void __user *)regs->rdx;
+	void __user *child_tid = (void __user *)regs->rdi;
+	if (!newsp)
+		newsp = regs->rsp;
+        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+}
+
+/*
+ * Some system calls that need sign extended arguments. This could be done by a generic wrapper.
+ */ 
+
+long sys32_lseek (unsigned int fd, int offset, unsigned int whence)
+{
+	return sys_lseek(fd, offset, whence);
+}
+
+long sys32_kill(int pid, int sig)
+{
+	return sys_kill(pid, sig);
+}
+ 
+asmlinkage long sys32_open(const char __user * filename, int flags, int mode)
+{
+	char * tmp;
+	int fd, error;
+
+	/* don't force O_LARGEFILE */
+	tmp = getname(filename);
+	fd = PTR_ERR(tmp);
+	if (!IS_ERR(tmp)) {
+		fd = get_unused_fd();
+		if (fd >= 0) {
+			struct file *f = filp_open(tmp, flags, mode);
+			error = PTR_ERR(f);
+			if (IS_ERR(f)) {
+				put_unused_fd(fd); 
+				fd = error;
+			} else
+				fd_install(fd, f);
+		}
+		putname(tmp);
+	}
+	return fd;
+}
+
+extern asmlinkage long
+sys_timer_create(clockid_t which_clock,
+		 struct sigevent __user *timer_event_spec,
+		 timer_t __user * created_timer_id);
+
+long
+sys32_timer_create(u32 clock, struct compat_sigevent __user *se32, timer_t __user *timer_id)
+{
+	struct sigevent __user *p = NULL;
+	if (se32) { 
+		struct sigevent se;
+		p = compat_alloc_user_space(sizeof(struct sigevent));
+		if (get_compat_sigevent(&se, se32) ||
+		    copy_to_user(p, &se, sizeof(se)))
+			return -EFAULT;
+	} 
+	return sys_timer_create(clock, p, timer_id);
+} 
+
+long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, 
+			__u32 len_low, __u32 len_high, int advice)
+{ 
+	return sys_fadvise64_64(fd,
+			       (((u64)offset_high)<<32) | offset_low,
+			       (((u64)len_high)<<32) | len_low,
+			       advice); 
+} 
+
+long sys32_vm86_warning(void)
+{ 
+	struct task_struct *me = current;
+	static char lastcomm[sizeof(me->comm)];
+	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
+		printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
+		       me->comm);
+		strncpy(lastcomm, me->comm, sizeof(lastcomm));
+	} 
+	return -ENOSYS;
+} 
+
+long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
+			  char __user * buf, size_t len)
+{
+	return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
+}
+
+static int __init ia32_init (void)
+{
+	printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n");  
+	return 0;
+}
+
+__initcall(ia32_init);
+
+extern unsigned long ia32_sys_call_table[];
+EXPORT_SYMBOL(ia32_sys_call_table);
diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c
new file mode 100644
index 000000000000..399ff4985099
--- /dev/null
+++ b/arch/x86_64/ia32/syscall32.c
@@ -0,0 +1,111 @@
+/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
+
+/* vsyscall handling for 32bit processes. Map a stub page into it 
+   on demand because 32bit cannot reach the kernel's fixmaps */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/stringify.h>
+#include <asm/proto.h>
+#include <asm/tlbflush.h>
+#include <asm/ia32_unistd.h>
+
+/* 32bit VDSOs mapped into user space. */ 
+asm(".section \".init.data\",\"aw\"\n"
+    "syscall32_syscall:\n"
+    ".incbin \"arch/x86_64/ia32/vsyscall-syscall.so\"\n"
+    "syscall32_syscall_end:\n"
+    "syscall32_sysenter:\n"
+    ".incbin \"arch/x86_64/ia32/vsyscall-sysenter.so\"\n"
+    "syscall32_sysenter_end:\n"
+    ".previous");
+
+extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
+extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
+extern int sysctl_vsyscall32;
+
+char *syscall32_page; 
+static int use_sysenter = -1;
+
+/*
+ * Map the 32bit vsyscall page on demand.
+ *
+ * RED-PEN: This knows too much about high level VM.
+ *
+ * Alternative would be to generate a vma with appropriate backing options
+ * and let it be handled by generic VM.
+ */
+int __map_syscall32(struct mm_struct *mm, unsigned long address)
+{ 
+	pgd_t *pgd;
+	pud_t *pud;
+	pte_t *pte;
+	pmd_t *pmd;
+	int err = -ENOMEM;
+
+	spin_lock(&mm->page_table_lock); 
+ 	pgd = pgd_offset(mm, address);
+ 	pud = pud_alloc(mm, pgd, address);
+ 	if (pud) {
+ 		pmd = pmd_alloc(mm, pud, address);
+ 		if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) {
+ 			if (pte_none(*pte)) {
+ 				set_pte(pte,
+ 					mk_pte(virt_to_page(syscall32_page),
+ 					       PAGE_KERNEL_VSYSCALL32));
+ 			}
+ 			/* Flush only the local CPU. Other CPUs taking a fault
+ 			   will just end up here again
+			   This probably not needed and just paranoia. */
+ 			__flush_tlb_one(address);
+ 			err = 0;
+		}
+	}
+	spin_unlock(&mm->page_table_lock);
+	return err;
+}
+
+int map_syscall32(struct mm_struct *mm, unsigned long address)
+{
+	int err;
+	down_read(&mm->mmap_sem);
+	err = __map_syscall32(mm, address);
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+static int __init init_syscall32(void)
+{ 
+	syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); 
+	if (!syscall32_page) 
+		panic("Cannot allocate syscall32 page"); 
+	SetPageReserved(virt_to_page(syscall32_page));
+ 	if (use_sysenter > 0) {
+ 		memcpy(syscall32_page, syscall32_sysenter,
+ 		       syscall32_sysenter_end - syscall32_sysenter);
+ 	} else {
+  		memcpy(syscall32_page, syscall32_syscall,
+  		       syscall32_syscall_end - syscall32_syscall);
+  	}	
+	return 0;
+} 
+	
+__initcall(init_syscall32); 
+
+/* May not be __init: called during resume */
+void syscall32_cpu_init(void)
+{
+	if (use_sysenter < 0)
+ 		use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
+
+	/* Load these always in case some future AMD CPU supports
+	   SYSENTER from compat mode too. */
+	checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+	checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
+	checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+
+	wrmsrl(MSR_CSTAR, ia32_cstar_target);
+}
diff --git a/arch/x86_64/ia32/tls32.c b/arch/x86_64/ia32/tls32.c
new file mode 100644
index 000000000000..1cc4340de3ca
--- /dev/null
+++ b/arch/x86_64/ia32/tls32.c
@@ -0,0 +1,163 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/user.h>
+
+#include <asm/uaccess.h>
+#include <asm/desc.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/proto.h>
+
+/*
+ * sys_alloc_thread_area: get a yet unused TLS descriptor index.
+ */
+static int get_free_idx(void)
+{
+	struct thread_struct *t = &current->thread;
+	int idx;
+
+	for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
+		if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
+			return idx + GDT_ENTRY_TLS_MIN;
+	return -ESRCH;
+}
+
+/*
+ * Set a given TLS descriptor:
+ * When you want addresses > 32bit use arch_prctl() 
+ */
+int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
+{
+	struct user_desc info;
+	struct n_desc_struct *desc;
+	int cpu, idx;
+
+	if (copy_from_user(&info, u_info, sizeof(info)))
+		return -EFAULT;
+
+	idx = info.entry_number;
+
+	/*
+	 * index -1 means the kernel should try to find and
+	 * allocate an empty descriptor:
+	 */
+	if (idx == -1) {
+		idx = get_free_idx();
+		if (idx < 0)
+			return idx;
+		if (put_user(idx, &u_info->entry_number))
+			return -EFAULT;
+	}
+
+	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+		return -EINVAL;
+
+	desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
+
+	/*
+	 * We must not get preempted while modifying the TLS.
+	 */
+	cpu = get_cpu();
+
+	if (LDT_empty(&info)) {
+		desc->a = 0;
+		desc->b = 0;
+	} else {
+		desc->a = LDT_entry_a(&info);
+		desc->b = LDT_entry_b(&info);
+	}
+	if (t == &current->thread)
+		load_TLS(t, cpu);
+
+	put_cpu();
+	return 0;
+}
+
+asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info)
+{ 
+	return do_set_thread_area(&current->thread, u_info); 
+} 
+
+
+/*
+ * Get the current Thread-Local Storage area:
+ */
+
+#define GET_BASE(desc) ( \
+	(((desc)->a >> 16) & 0x0000ffff) | \
+	(((desc)->b << 16) & 0x00ff0000) | \
+	( (desc)->b        & 0xff000000)   )
+
+#define GET_LIMIT(desc) ( \
+	((desc)->a & 0x0ffff) | \
+	 ((desc)->b & 0xf0000) )
+	
+#define GET_32BIT(desc)		(((desc)->b >> 22) & 1)
+#define GET_CONTENTS(desc)	(((desc)->b >> 10) & 3)
+#define GET_WRITABLE(desc)	(((desc)->b >>  9) & 1)
+#define GET_LIMIT_PAGES(desc)	(((desc)->b >> 23) & 1)
+#define GET_PRESENT(desc)	(((desc)->b >> 15) & 1)
+#define GET_USEABLE(desc)	(((desc)->b >> 20) & 1)
+#define GET_LONGMODE(desc)	(((desc)->b >> 21) & 1)
+
+int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
+{
+	struct user_desc info;
+	struct n_desc_struct *desc;
+	int idx;
+
+	if (get_user(idx, &u_info->entry_number))
+		return -EFAULT;
+	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+		return -EINVAL;
+
+	desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
+
+	memset(&info, 0, sizeof(struct user_desc));
+	info.entry_number = idx;
+	info.base_addr = GET_BASE(desc);
+	info.limit = GET_LIMIT(desc);
+	info.seg_32bit = GET_32BIT(desc);
+	info.contents = GET_CONTENTS(desc);
+	info.read_exec_only = !GET_WRITABLE(desc);
+	info.limit_in_pages = GET_LIMIT_PAGES(desc);
+	info.seg_not_present = !GET_PRESENT(desc);
+	info.useable = GET_USEABLE(desc);
+	info.lm = GET_LONGMODE(desc);
+
+	if (copy_to_user(u_info, &info, sizeof(info)))
+		return -EFAULT;
+	return 0;
+}
+
+asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info)
+{
+	return do_get_thread_area(&current->thread, u_info);
+} 
+
+
+int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs)
+{
+	struct n_desc_struct *desc;
+	struct user_desc info;
+	struct user_desc __user *cp;
+	int idx;
+	
+	cp = (void __user *)childregs->rsi;
+	if (copy_from_user(&info, cp, sizeof(info)))
+		return -EFAULT;
+	if (LDT_empty(&info))
+		return -EINVAL;
+	
+	idx = info.entry_number;
+	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+		return -EINVAL;
+	
+	desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
+	desc->a = LDT_entry_a(&info);
+	desc->b = LDT_entry_b(&info);
+
+	return 0;
+}
diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S
new file mode 100644
index 000000000000..ba4067d350e4
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall-sigreturn.S
@@ -0,0 +1,120 @@
+/*
+ * Common code for the sigreturn entry points on the vsyscall page.
+ * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80)
+ * to enter the kernel.
+ * This file is #include'd by vsyscall-*.S to define them after the
+ * vsyscall entry point.  The addresses we get for these entry points
+ * by doing ".balign 32" must match in both versions of the page.
+ */
+
+	.section .text.sigreturn,"ax"
+	.balign 32
+	.globl __kernel_sigreturn
+	.type __kernel_sigreturn,@function
+__kernel_sigreturn:
+.LSTART_sigreturn:
+	popl %eax
+	movl $__NR_ia32_sigreturn, %eax
+	SYSCALL_ENTER_KERNEL
+.LEND_sigreturn:
+	.size __kernel_sigreturn,.-.LSTART_sigreturn
+
+	.section .text.rtsigreturn,"ax"
+	.balign 32
+	.globl __kernel_rt_sigreturn
+	.type __kernel_rt_sigreturn,@function
+__kernel_rt_sigreturn:
+.LSTART_rt_sigreturn:
+	movl $__NR_ia32_rt_sigreturn, %eax
+	SYSCALL_ENTER_KERNEL
+.LEND_rt_sigreturn:
+	.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
+
+	.section .eh_frame,"a",@progbits
+	.long .LENDFDE2-.LSTARTFDE2	/* Length FDE */
+.LSTARTFDE2:
+	.long .LSTARTFDE2-.LSTARTFRAME	/* CIE pointer */
+	/* HACK: The dwarf2 unwind routines will subtract 1 from the
+	   return address to get an address in the middle of the
+	   presumed call instruction.  Since we didn't get here via
+	   a call, we need to include the nop before the real start
+	   to make up for it.  */
+	.long .LSTART_sigreturn-1-.	/* PC-relative start address */
+	.long .LEND_sigreturn-.LSTART_sigreturn+1
+	.uleb128 0			/* Augmentation length */
+	/* What follows are the instructions for the table generation.
+	   We record the locations of each register saved.  This is
+	   complicated by the fact that the "CFA" is always assumed to
+	   be the value of the stack pointer in the caller.  This means
+	   that we must define the CFA of this body of code to be the
+	   saved value of the stack pointer in the sigcontext.  Which
+	   also means that there is no fixed relation to the other 
+	   saved registers, which means that we must use DW_CFA_expression
+	   to compute their addresses.  It also means that when we 
+	   adjust the stack with the popl, we have to do it all over again.  */
+
+#define do_cfa_expr(offset)						\
+	.byte 0x0f;			/* DW_CFA_def_cfa_expression */	\
+	.uleb128 1f-0f;			/*   length */			\
+0:	.byte 0x74;			/*     DW_OP_breg4 */		\
+	.sleb128 offset;		/*      offset */		\
+	.byte 0x06;			/*     DW_OP_deref */		\
+1:
+
+#define do_expr(regno, offset)						\
+	.byte 0x10;			/* DW_CFA_expression */		\
+	.uleb128 regno;			/*   regno */			\
+	.uleb128 1f-0f;			/*   length */			\
+0:	.byte 0x74;			/*     DW_OP_breg4 */		\
+	.sleb128 offset;		/*       offset */		\
+1:
+
+	do_cfa_expr(IA32_SIGCONTEXT_esp+4)
+	do_expr(0, IA32_SIGCONTEXT_eax+4)
+	do_expr(1, IA32_SIGCONTEXT_ecx+4)
+	do_expr(2, IA32_SIGCONTEXT_edx+4)
+	do_expr(3, IA32_SIGCONTEXT_ebx+4)
+	do_expr(5, IA32_SIGCONTEXT_ebp+4)
+	do_expr(6, IA32_SIGCONTEXT_esi+4)
+	do_expr(7, IA32_SIGCONTEXT_edi+4)
+	do_expr(8, IA32_SIGCONTEXT_eip+4)
+
+	.byte 0x42	/* DW_CFA_advance_loc 2 -- nop; popl eax. */
+
+	do_cfa_expr(IA32_SIGCONTEXT_esp)
+	do_expr(0, IA32_SIGCONTEXT_eax)
+	do_expr(1, IA32_SIGCONTEXT_ecx)
+	do_expr(2, IA32_SIGCONTEXT_edx)
+	do_expr(3, IA32_SIGCONTEXT_ebx)
+	do_expr(5, IA32_SIGCONTEXT_ebp)
+	do_expr(6, IA32_SIGCONTEXT_esi)
+	do_expr(7, IA32_SIGCONTEXT_edi)
+	do_expr(8, IA32_SIGCONTEXT_eip)
+
+	.align 4
+.LENDFDE2:
+
+	.long .LENDFDE3-.LSTARTFDE3	/* Length FDE */
+.LSTARTFDE3:
+	.long .LSTARTFDE3-.LSTARTFRAME	/* CIE pointer */
+	/* HACK: See above wrt unwind library assumptions.  */
+	.long .LSTART_rt_sigreturn-1-.	/* PC-relative start address */
+	.long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
+	.uleb128 0			/* Augmentation */
+	/* What follows are the instructions for the table generation.
+	   We record the locations of each register saved.  This is
+	   slightly less complicated than the above, since we don't
+	   modify the stack pointer in the process.  */
+
+	do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp)
+	do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax)
+	do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx)
+	do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx)
+	do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx)
+	do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp)
+	do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi)
+	do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi)
+	do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip)
+
+	.align 4
+.LENDFDE3:
diff --git a/arch/x86_64/ia32/vsyscall-syscall.S b/arch/x86_64/ia32/vsyscall-syscall.S
new file mode 100644
index 000000000000..e2aaf3de8a42
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall-syscall.S
@@ -0,0 +1,68 @@
+/*
+ * Code for the vsyscall page.  This version uses the syscall instruction.
+ */
+
+#include <asm/ia32_unistd.h>
+#include <asm/offset.h>
+#include <asm/segment.h>
+
+	.text
+	.section .text.vsyscall,"ax"
+	.globl __kernel_vsyscall
+	.type __kernel_vsyscall,@function
+__kernel_vsyscall:
+.LSTART_vsyscall:
+	push	%ebp
+.Lpush_ebp:
+	movl	%ecx, %ebp
+	syscall
+	movl	$__USER32_DS, %ecx
+	movl	%ecx, %ss
+	movl	%ebp, %ecx
+	popl	%ebp
+.Lpop_ebp:
+	ret
+.LEND_vsyscall:
+	.size __kernel_vsyscall,.-.LSTART_vsyscall
+
+	.section .eh_frame,"a",@progbits
+.LSTARTFRAME:
+	.long .LENDCIE-.LSTARTCIE
+.LSTARTCIE:
+	.long 0			/* CIE ID */
+	.byte 1			/* Version number */
+	.string "zR"		/* NUL-terminated augmentation string */
+	.uleb128 1		/* Code alignment factor */
+	.sleb128 -4		/* Data alignment factor */
+	.byte 8			/* Return address register column */
+	.uleb128 1		/* Augmentation value length */
+	.byte 0x1b		/* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+	.byte 0x0c		/* DW_CFA_def_cfa */
+	.uleb128 4
+	.uleb128 4
+	.byte 0x88		/* DW_CFA_offset, column 0x8 */
+	.uleb128 1
+	.align 4
+.LENDCIE:
+
+	.long .LENDFDE1-.LSTARTFDE1	/* Length FDE */
+.LSTARTFDE1:
+	.long .LSTARTFDE1-.LSTARTFRAME	/* CIE pointer */
+	.long .LSTART_vsyscall-.	/* PC-relative start address */
+	.long .LEND_vsyscall-.LSTART_vsyscall
+	.uleb128 0			/* Augmentation length */
+	/* What follows are the instructions for the table generation.
+	   We have to record all changes of the stack pointer.  */
+	.byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
+	.byte 0x0e		/* DW_CFA_def_cfa_offset */
+	.uleb128 8
+	.byte 0x85, 0x02	/* DW_CFA_offset %ebp -8 */
+	.byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
+	.byte 0xc5		/* DW_CFA_restore %ebp */
+	.byte 0x0e		/* DW_CFA_def_cfa_offset */
+	.uleb128 4
+	.align 4
+.LENDFDE1:
+
+#define SYSCALL_ENTER_KERNEL	syscall
+#include "vsyscall-sigreturn.S"
diff --git a/arch/x86_64/ia32/vsyscall-sysenter.S b/arch/x86_64/ia32/vsyscall-sysenter.S
new file mode 100644
index 000000000000..8fb8e0ff3afa
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall-sysenter.S
@@ -0,0 +1,94 @@
+/*
+ * Code for the vsyscall page.  This version uses the sysenter instruction.
+ */
+
+#include <asm/ia32_unistd.h>
+#include <asm/offset.h>
+
+	.text
+	.section .text.vsyscall,"ax"
+	.globl __kernel_vsyscall
+	.type __kernel_vsyscall,@function
+__kernel_vsyscall:
+.LSTART_vsyscall:
+	push	%ecx
+.Lpush_ecx:
+	push	%edx
+.Lpush_edx:
+	push	%ebp
+.Lenter_kernel:
+	movl	%esp,%ebp
+	sysenter
+	.space 7,0x90
+	jmp	.Lenter_kernel
+	/* 16: System call normal return point is here! */
+	pop	%ebp
+.Lpop_ebp:
+	pop	%edx
+.Lpop_edx:
+	pop	%ecx
+.Lpop_ecx:
+	ret
+.LEND_vsyscall:
+	.size __kernel_vsyscall,.-.LSTART_vsyscall
+
+	.section .eh_frame,"a",@progbits
+.LSTARTFRAME:
+	.long .LENDCIE-.LSTARTCIE
+.LSTARTCIE:
+	.long 0			/* CIE ID */
+	.byte 1			/* Version number */
+	.string "zR"		/* NUL-terminated augmentation string */
+	.uleb128 1		/* Code alignment factor */
+	.sleb128 -4		/* Data alignment factor */
+	.byte 8			/* Return address register column */
+	.uleb128 1		/* Augmentation value length */
+	.byte 0x1b		/* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+	.byte 0x0c		/* DW_CFA_def_cfa */
+	.uleb128 4
+	.uleb128 4
+	.byte 0x88		/* DW_CFA_offset, column 0x8 */
+	.uleb128 1
+	.align 4
+.LENDCIE:
+
+	.long .LENDFDE1-.LSTARTFDE1	/* Length FDE */
+.LSTARTFDE1:
+	.long .LSTARTFDE1-.LSTARTFRAME	/* CIE pointer */
+	.long .LSTART_vsyscall-.	/* PC-relative start address */
+	.long .LEND_vsyscall-.LSTART_vsyscall
+	.uleb128 0			/* Augmentation length */
+	/* What follows are the instructions for the table generation.
+	   We have to record all changes of the stack pointer.  */
+	.byte 0x04		/* DW_CFA_advance_loc4 */
+	.long .Lpush_ecx-.LSTART_vsyscall
+	.byte 0x0e		/* DW_CFA_def_cfa_offset */
+	.byte 0x08		/* RA at offset 8 now */
+	.byte 0x04		/* DW_CFA_advance_loc4 */
+	.long .Lpush_edx-.Lpush_ecx
+	.byte 0x0e		/* DW_CFA_def_cfa_offset */
+	.byte 0x0c		/* RA at offset 12 now */
+	.byte 0x04		/* DW_CFA_advance_loc4 */
+	.long .Lenter_kernel-.Lpush_edx
+	.byte 0x0e		/* DW_CFA_def_cfa_offset */
+	.byte 0x10		/* RA at offset 16 now */
+	.byte 0x85, 0x04	/* DW_CFA_offset %ebp -16 */
+	/* Finally the epilogue.  */
+	.byte 0x04		/* DW_CFA_advance_loc4 */
+	.long .Lpop_ebp-.Lenter_kernel
+	.byte 0x0e		/* DW_CFA_def_cfa_offset */
+	.byte 0x12		/* RA at offset 12 now */
+	.byte 0xc5		/* DW_CFA_restore %ebp */
+	.byte 0x04		/* DW_CFA_advance_loc4 */
+	.long .Lpop_edx-.Lpop_ebp
+	.byte 0x0e		/* DW_CFA_def_cfa_offset */
+	.byte 0x08		/* RA at offset 8 now */
+	.byte 0x04		/* DW_CFA_advance_loc4 */
+	.long .Lpop_ecx-.Lpop_edx
+	.byte 0x0e		/* DW_CFA_def_cfa_offset */
+	.byte 0x04		/* RA at offset 4 now */
+	.align 4
+.LENDFDE1:
+
+#define SYSCALL_ENTER_KERNEL	int $0x80
+#include "vsyscall-sigreturn.S"
diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds
new file mode 100644
index 000000000000..fa4b4dd4a9ff
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall.lds
@@ -0,0 +1,77 @@
+/*
+ * Linker script for vsyscall DSO.  The vsyscall page is an ELF shared
+ * object prelinked to its virtual address. This script controls its layout.
+ */
+
+/* This must match <asm/fixmap.h>.  */
+VSYSCALL_BASE = 0xffffe000;
+
+SECTIONS
+{
+  . = VSYSCALL_BASE + SIZEOF_HEADERS;
+
+  .hash           : { *(.hash) }		:text
+  .dynsym         : { *(.dynsym) }
+  .dynstr         : { *(.dynstr) }
+  .gnu.version    : { *(.gnu.version) }
+  .gnu.version_d  : { *(.gnu.version_d) }
+  .gnu.version_r  : { *(.gnu.version_r) }
+
+  /* This linker script is used both with -r and with -shared.
+     For the layouts to match, we need to skip more than enough
+     space for the dynamic symbol table et al.  If this amount
+     is insufficient, ld -shared will barf.  Just increase it here.  */
+  . = VSYSCALL_BASE + 0x400;
+  
+  .text.vsyscall   : { *(.text.vsyscall) } 	:text =0x90909090
+
+  /* This is an 32bit object and we cannot easily get the offsets
+     into the 64bit kernel. Just hardcode them here. This assumes
+     that all the stubs don't need more than 0x100 bytes. */
+  . = VSYSCALL_BASE + 0x500;
+
+  .text.sigreturn  : { *(.text.sigreturn) }	:text =0x90909090
+
+  . = VSYSCALL_BASE + 0x600;
+
+  .text.rtsigreturn : { *(.text.rtsigreturn) }   :text =0x90909090
+	
+  .eh_frame_hdr   : { *(.eh_frame_hdr) }	:text :eh_frame_hdr
+  .eh_frame       : { KEEP (*(.eh_frame)) }	:text
+  .dynamic        : { *(.dynamic) }		:text :dynamic
+  .useless        : {
+  	*(.got.plt) *(.got)
+	*(.data .data.* .gnu.linkonce.d.*)
+	*(.dynbss)
+	*(.bss .bss.* .gnu.linkonce.b.*)
+  }						:text
+}
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+  text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+  dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+  eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+  LINUX_2.5 {
+    global:
+    	__kernel_vsyscall;
+    	__kernel_sigreturn;
+    	__kernel_rt_sigreturn;
+
+    local: *;
+  };
+}
+
+/* The ELF entry point can be used to set the AT_SYSINFO value.  */
+ENTRY(__kernel_vsyscall);
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
new file mode 100644
index 000000000000..0a3318e08ab6
--- /dev/null
+++ b/arch/x86_64/kernel/Makefile
@@ -0,0 +1,45 @@
+#
+# Makefile for the linux kernel.
+#
+
+extra-y 	:= head.o head64.o init_task.o vmlinux.lds
+EXTRA_AFLAGS	:= -traditional
+obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \
+		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
+		x8664_ksyms.o i387.o syscall.o vsyscall.o \
+		setup64.o bootflag.o e820.o reboot.o quirks.o
+
+obj-$(CONFIG_X86_MCE)         += mce.o
+obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
+obj-$(CONFIG_MTRR)		+= ../../i386/kernel/cpu/mtrr/
+obj-$(CONFIG_ACPI_BOOT)		+= acpi/
+obj-$(CONFIG_X86_MSR)		+= msr.o
+obj-$(CONFIG_MICROCODE)		+= microcode.o
+obj-$(CONFIG_X86_CPUID)		+= cpuid.o
+obj-$(CONFIG_SMP)		+= smp.o smpboot.o trampoline.o
+obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o  nmi.o
+obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o mpparse.o \
+		genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_PM)		+= suspend.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend_asm.o
+obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
+obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o aperture.o
+obj-$(CONFIG_DUMMY_IOMMU)	+= pci-nommu.o pci-dma.o
+obj-$(CONFIG_SWIOTLB)		+= swiotlb.o
+obj-$(CONFIG_KPROBES)		+= kprobes.o
+
+obj-$(CONFIG_MODULES)		+= module.o
+
+obj-y				+= topology.o
+obj-y				+= intel_cacheinfo.o
+
+CFLAGS_vsyscall.o		:= $(PROFILING) -g0
+
+bootflag-y			+= ../../i386/kernel/bootflag.o
+cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../i386/kernel/cpuid.o
+topology-y                     += ../../i386/mach-default/topology.o
+swiotlb-$(CONFIG_SWIOTLB)      += ../../ia64/lib/swiotlb.o
+microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../i386/kernel/microcode.o
+intel_cacheinfo-y		+= ../../i386/kernel/cpu/intel_cacheinfo.o
+quirks-y			+= ../../i386/kernel/quirks.o
diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile
new file mode 100644
index 000000000000..d2c2ee5f9a88
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_ACPI_BOOT)		:= boot.o
+boot-$(CONFIG_ACPI_BOOT)	:= ../../../i386/kernel/acpi/boot.o
+obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
new file mode 100644
index 000000000000..7a275de6df22
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -0,0 +1,132 @@
+/*
+ *  acpi.c - Architecture-Specific Low-Level ACPI Support
+ *
+ *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
+ *  Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
+ *  Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
+ *  Copyright (C) 2003 Pavel Machek, SuSE Labs
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/acpi.h>
+#include <asm/mpspec.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/tlbflush.h>
+
+
+/* --------------------------------------------------------------------------
+                              Low-Level Sleep Support
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI_SLEEP
+
+/* address in low memory of the wakeup routine. */
+unsigned long acpi_wakeup_address = 0;
+unsigned long acpi_video_flags;
+extern char wakeup_start, wakeup_end;
+
+extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
+
+static pgd_t low_ptr;
+
+static void init_low_mapping(void)
+{
+	pgd_t *slot0 = pgd_offset(current->mm, 0UL);
+	low_ptr = *slot0;
+	set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
+	flush_tlb_all();
+}
+
+/**
+ * acpi_save_state_mem - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int acpi_save_state_mem (void)
+{
+	init_low_mapping();
+
+	memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start);
+	acpi_copy_wakeup_routine(acpi_wakeup_address);
+
+	return 0;
+}
+
+/*
+ * acpi_restore_state
+ */
+void acpi_restore_state_mem (void)
+{
+	set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
+	flush_tlb_all();
+}
+
+/**
+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ *
+ * We allocate a page in low memory for the wakeup
+ * routine for when we come back from a sleep state. The
+ * runtime allocator allows specification of <16M pages, but not
+ * <1M pages.
+ */
+void __init acpi_reserve_bootmem(void)
+{
+	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
+	if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
+		printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
+}
+
+static int __init acpi_sleep_setup(char *str)
+{
+	while ((str != NULL) && (*str != '\0')) {
+		if (strncmp(str, "s3_bios", 7) == 0)
+			acpi_video_flags = 1;
+		if (strncmp(str, "s3_mode", 7) == 0)
+			acpi_video_flags |= 2;
+		str = strchr(str, ',');
+		if (str != NULL)
+			str += strspn(str, ", \t");
+	}
+	return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
+
+#endif /*CONFIG_ACPI_SLEEP*/
+
+void acpi_pci_link_exit(void) {}
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
new file mode 100644
index 000000000000..a4c630034cd4
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -0,0 +1,527 @@
+.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+
+# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+#
+# wakeup_code runs in real mode, and at unknown address (determined at run-time).
+# Therefore it must only use relative jumps/calls. 
+#
+# Do we need to deal with A20? It is okay: ACPI specs says A20 must be enabled
+#
+# If physical address of wakeup_code is 0x12345, BIOS should call us with
+# cs = 0x1234, eip = 0x05
+#
+
+
+ALIGN
+	.align	16
+ENTRY(wakeup_start)
+wakeup_code:
+	wakeup_code_start = .
+	.code16
+
+# Running in *copy* of this code, somewhere in low 1MB.
+
+	movb	$0xa1, %al	;  outb %al, $0x80
+	cli
+	cld
+	# setup data segment
+	movw	%cs, %ax
+	movw	%ax, %ds					# Make ds:0 point to wakeup_start
+	movw	%ax, %ss
+	mov	$(wakeup_stack - wakeup_code), %sp		# Private stack is needed for ASUS board
+
+	pushl	$0						# Kill any dangerous flags
+	popfl
+
+	movl	real_magic - wakeup_code, %eax
+	cmpl	$0x12345678, %eax
+	jne	bogus_real_magic
+
+	testl	$1, video_flags - wakeup_code
+	jz	1f
+	lcall   $0xc000,$3
+	movw	%cs, %ax
+	movw	%ax, %ds					# Bios might have played with that
+	movw	%ax, %ss
+1:
+
+	testl	$2, video_flags - wakeup_code
+	jz	1f
+	mov	video_mode - wakeup_code, %ax
+	call	mode_seta
+1:
+
+ 	movw	$0xb800, %ax
+	movw	%ax,%fs
+	movw	$0x0e00 + 'L', %fs:(0x10)
+
+	movb	$0xa2, %al	;  outb %al, $0x80
+	
+	lidt	%ds:idt_48a - wakeup_code
+	xorl	%eax, %eax
+	movw	%ds, %ax			# (Convert %ds:gdt to a linear ptr)
+	shll	$4, %eax
+	addl	$(gdta - wakeup_code), %eax
+	movl	%eax, gdt_48a +2 - wakeup_code
+	lgdt	%ds:gdt_48a - wakeup_code		# load gdt with whatever is
+						# appropriate
+
+	movl	$1, %eax			# protected mode (PE) bit
+	lmsw	%ax				# This is it!
+	jmp	1f
+1:
+
+	.byte 0x66, 0xea			# prefix + jmpi-opcode
+	.long	wakeup_32 - __START_KERNEL_map
+	.word	__KERNEL_CS
+
+	.code32
+wakeup_32:
+# Running in this code, but at low address; paging is not yet turned on.
+	movb	$0xa5, %al	;  outb %al, $0x80
+
+	/* Check if extended functions are implemented */		
+	movl	$0x80000000, %eax
+	cpuid
+	cmpl	$0x80000000, %eax
+	jbe	bogus_cpu
+	wbinvd
+	mov	$0x80000001, %eax
+	cpuid
+	btl	$29, %edx
+	jnc	bogus_cpu
+	movl	%edx,%edi
+	
+	movw	$__KERNEL_DS, %ax
+	movw	%ax, %ds
+	movw	%ax, %es
+	movw	%ax, %fs
+	movw	%ax, %gs
+
+	movw	$__KERNEL_DS, %ax	
+	movw	%ax, %ss
+
+	mov	$(wakeup_stack - __START_KERNEL_map), %esp
+	movl	saved_magic - __START_KERNEL_map, %eax
+	cmpl	$0x9abcdef0, %eax
+	jne	bogus_32_magic
+
+	/*
+	 * Prepare for entering 64bits mode
+	 */
+
+	/* Enable PAE mode and PGE */
+	xorl	%eax, %eax
+	btsl	$5, %eax
+	btsl	$7, %eax
+	movl	%eax, %cr4
+
+	/* Setup early boot stage 4 level pagetables */
+	movl	$(wakeup_level4_pgt - __START_KERNEL_map), %eax
+	movl	%eax, %cr3
+
+	/* Setup EFER (Extended Feature Enable Register) */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	/* Fool rdmsr and reset %eax to avoid dependences */
+	xorl	%eax, %eax
+	/* Enable Long Mode */
+	btsl	$_EFER_LME, %eax
+	/* Enable System Call */
+	btsl	$_EFER_SCE, %eax
+
+	/* No Execute supported? */	
+	btl	$20,%edi
+	jnc     1f
+	btsl	$_EFER_NX, %eax
+1:	
+				
+	/* Make changes effective */
+	wrmsr
+	wbinvd
+
+	xorl	%eax, %eax
+	btsl	$31, %eax			/* Enable paging and in turn activate Long Mode */
+	btsl	$0, %eax			/* Enable protected mode */
+	btsl	$1, %eax			/* Enable MP */
+	btsl	$4, %eax			/* Enable ET */
+	btsl	$5, %eax			/* Enable NE */
+	btsl	$16, %eax			/* Enable WP */
+	btsl	$18, %eax			/* Enable AM */
+
+	/* Make changes effective */
+	movl	%eax, %cr0
+	/* At this point:
+		CR4.PAE must be 1
+		CS.L must be 0
+		CR3 must point to PML4
+		Next instruction must be a branch
+		This must be on identity-mapped page
+	*/
+	jmp	reach_compatibility_mode
+reach_compatibility_mode:
+	movw	$0x0e00 + 'i', %ds:(0xb8012)
+	movb	$0xa8, %al	;  outb %al, $0x80; 	
+		
+	/*
+	 * At this point we're in long mode but in 32bit compatibility mode
+	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
+	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load
+	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	 */
+
+	movw	$0x0e00 + 'n', %ds:(0xb8014)
+	movb	$0xa9, %al	;  outb %al, $0x80
+	
+	/* Load new GDT with the 64bit segment using 32bit descriptor */
+	movl	$(pGDT32 - __START_KERNEL_map), %eax
+	lgdt	(%eax)
+
+	movl    $(wakeup_jumpvector - __START_KERNEL_map), %eax
+	/* Finally jump in 64bit mode */
+	ljmp	*(%eax)
+
+wakeup_jumpvector:
+	.long	wakeup_long64 - __START_KERNEL_map
+	.word	__KERNEL_CS
+
+.code64
+
+	/*	Hooray, we are in Long 64-bit mode (but still running in low memory) */
+wakeup_long64:
+	/*
+	 * We must switch to a new descriptor in kernel space for the GDT
+	 * because soon the kernel won't have access anymore to the userspace
+	 * addresses where we're currently running on. We have to do that here
+	 * because in 32bit we couldn't load a 64bit linear address.
+	 */
+	lgdt	cpu_gdt_descr - __START_KERNEL_map
+
+	movw	$0x0e00 + 'u', %ds:(0xb8016)
+	
+	nop
+	nop
+	movw	$__KERNEL_DS, %ax
+	movw	%ax, %ss	
+	movw	%ax, %ds
+	movw	%ax, %es
+	movw	%ax, %fs
+	movw	%ax, %gs
+	movq	saved_esp, %rsp
+
+	movw	$0x0e00 + 'x', %ds:(0xb8018)
+	movq	saved_ebx, %rbx
+	movq	saved_edi, %rdi
+	movq	saved_esi, %rsi
+	movq	saved_ebp, %rbp
+
+	movw	$0x0e00 + '!', %ds:(0xb801a)
+	movq	saved_eip, %rax
+	jmp	*%rax
+
+.code32
+
+	.align	64	
+gdta:
+	.word	0, 0, 0, 0			# dummy
+
+	.word	0, 0, 0, 0			# unused
+
+	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
+	.word	0				# base address = 0
+	.word	0x9B00				# code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?)
+	.word	0x00CF				# granularity = 4096, 386
+						#  (+5th nibble of limit)
+
+	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
+	.word	0				# base address = 0
+	.word	0x9200				# data read/write
+	.word	0x00CF				# granularity = 4096, 386
+						#  (+5th nibble of limit)
+# this is 64bit descriptor for code
+	.word	0xFFFF
+	.word	0
+	.word	0x9A00				# code read/exec
+	.word	0x00AF				# as above, but it is long mode and with D=0
+
+idt_48a:
+	.word	0				# idt limit = 0
+	.word	0, 0				# idt base = 0L
+
+gdt_48a:
+	.word	0x8000				# gdt limit=2048,
+						#  256 GDT entries
+	.word	0, 0				# gdt base (filled in later)
+	
+	
+real_save_gdt:	.word 0
+		.quad 0
+real_magic:	.quad 0
+video_mode:	.quad 0
+video_flags:	.quad 0
+
+bogus_real_magic:
+	movb	$0xba,%al	;  outb %al,$0x80		
+	jmp bogus_real_magic
+
+bogus_32_magic:
+	movb	$0xb3,%al	;  outb %al,$0x80
+	jmp bogus_32_magic
+
+bogus_31_magic:
+	movb	$0xb1,%al	;  outb %al,$0x80
+	jmp bogus_31_magic
+
+bogus_cpu:
+	movb	$0xbc,%al	;  outb %al,$0x80
+	jmp bogus_cpu
+
+	
+/* This code uses an extended set of video mode numbers. These include:
+ * Aliases for standard modes
+ *	NORMAL_VGA (-1)
+ *	EXTENDED_VGA (-2)
+ *	ASK_VGA (-3)
+ * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
+ * of compatibility when extending the table. These are between 0x00 and 0xff.
+ */
+#define VIDEO_FIRST_MENU 0x0000
+
+/* Standard BIOS video modes (BIOS number + 0x0100) */
+#define VIDEO_FIRST_BIOS 0x0100
+
+/* VESA BIOS video modes (VESA number + 0x0200) */
+#define VIDEO_FIRST_VESA 0x0200
+
+/* Video7 special modes (BIOS number + 0x0900) */
+#define VIDEO_FIRST_V7 0x0900
+
+# Setting of user mode (AX=mode ID) => CF=success
+mode_seta:
+	movw	%ax, %bx
+#if 0
+	cmpb	$0xff, %ah
+	jz	setalias
+
+	testb	$VIDEO_RECALC>>8, %ah
+	jnz	_setrec
+
+	cmpb	$VIDEO_FIRST_RESOLUTION>>8, %ah
+	jnc	setres
+	
+	cmpb	$VIDEO_FIRST_SPECIAL>>8, %ah
+	jz	setspc
+
+	cmpb	$VIDEO_FIRST_V7>>8, %ah
+	jz	setv7
+#endif
+	
+	cmpb	$VIDEO_FIRST_VESA>>8, %ah
+	jnc	check_vesaa
+#if 0	
+	orb	%ah, %ah
+	jz	setmenu
+#endif
+	
+	decb	%ah
+#	jz	setbios				  Add bios modes later
+
+setbada:	clc
+	ret
+
+check_vesaa:
+	subb	$VIDEO_FIRST_VESA>>8, %bh
+	orw	$0x4000, %bx			# Use linear frame buffer
+	movw	$0x4f02, %ax			# VESA BIOS mode set call
+	int	$0x10
+	cmpw	$0x004f, %ax			# AL=4f if implemented
+	jnz	_setbada				# AH=0 if OK
+
+	stc
+	ret
+
+_setbada: jmp setbada
+
+	.code64
+bogus_magic:
+	movw	$0x0e00 + 'B', %ds:(0xb8018)
+	jmp bogus_magic
+
+bogus_magic2:
+	movw	$0x0e00 + '2', %ds:(0xb8018)
+	jmp bogus_magic2
+	
+
+wakeup_stack_begin:	# Stack grows down
+
+.org	0xff0
+wakeup_stack:		# Just below end of page
+
+ENTRY(wakeup_end)
+	
+##
+# acpi_copy_wakeup_routine
+#
+# Copy the above routine to low memory.
+#
+# Parameters:
+# %rdi:	place to copy wakeup routine to
+#
+# Returned address is location of code in low memory (past data and stack)
+#
+ENTRY(acpi_copy_wakeup_routine)
+	pushq	%rax
+	pushq	%rcx
+	pushq	%rdx
+
+	sgdt	saved_gdt
+	sidt	saved_idt
+	sldt	saved_ldt
+	str	saved_tss
+
+	movq    %cr3, %rdx
+	movq    %rdx, saved_cr3
+	movq    %cr4, %rdx
+	movq    %rdx, saved_cr4
+	movq	%cr0, %rdx
+	movq	%rdx, saved_cr0
+	sgdt    real_save_gdt - wakeup_start (,%rdi)
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	movl	%eax, saved_efer
+	movl	%edx, saved_efer2
+
+	movl	saved_video_mode, %edx
+	movl	%edx, video_mode - wakeup_start (,%rdi)
+	movl	acpi_video_flags, %edx
+	movl	%edx, video_flags - wakeup_start (,%rdi)
+	movq	$0x12345678, real_magic - wakeup_start (,%rdi)
+	movq	$0x123456789abcdef0, %rdx
+	movq	%rdx, saved_magic
+
+	movl	saved_magic - __START_KERNEL_map, %eax
+	cmpl	$0x9abcdef0, %eax
+	jne	bogus_32_magic
+
+	# make sure %cr4 is set correctly (features, etc)
+	movl	saved_cr4 - __START_KERNEL_map, %eax
+	movq	%rax, %cr4
+
+	movl	saved_cr0 - __START_KERNEL_map, %eax
+	movq	%rax, %cr0
+	jmp	1f		# Flush pipelines
+1:
+	# restore the regs we used
+	popq	%rdx
+	popq	%rcx
+	popq	%rax
+ENTRY(do_suspend_lowlevel_s4bios)
+	ret
+
+	.align 2
+	.p2align 4,,15
+.globl do_suspend_lowlevel
+	.type	do_suspend_lowlevel,@function
+do_suspend_lowlevel:
+.LFB5:
+	subq	$8, %rsp
+	xorl	%eax, %eax
+	call	save_processor_state
+
+	movq %rsp, saved_context_esp(%rip)
+	movq %rax, saved_context_eax(%rip)
+	movq %rbx, saved_context_ebx(%rip)
+	movq %rcx, saved_context_ecx(%rip)
+	movq %rdx, saved_context_edx(%rip)
+	movq %rbp, saved_context_ebp(%rip)
+	movq %rsi, saved_context_esi(%rip)
+	movq %rdi, saved_context_edi(%rip)
+	movq %r8,  saved_context_r08(%rip)
+	movq %r9,  saved_context_r09(%rip)
+	movq %r10, saved_context_r10(%rip)
+	movq %r11, saved_context_r11(%rip)
+	movq %r12, saved_context_r12(%rip)
+	movq %r13, saved_context_r13(%rip)
+	movq %r14, saved_context_r14(%rip)
+	movq %r15, saved_context_r15(%rip)
+	pushfq ; popq saved_context_eflags(%rip)
+
+	movq	$.L97, saved_eip(%rip)
+
+	movq %rsp,saved_esp
+	movq %rbp,saved_ebp
+	movq %rbx,saved_ebx
+	movq %rdi,saved_edi
+	movq %rsi,saved_esi
+
+	addq	$8, %rsp
+	movl	$3, %edi
+	xorl	%eax, %eax
+	jmp	acpi_enter_sleep_state
+.L97:
+	.p2align 4,,7
+.L99:
+	.align 4
+	movl	$24, %eax
+	movw %ax, %ds
+	movq	saved_context+58(%rip), %rax
+	movq %rax, %cr4
+	movq	saved_context+50(%rip), %rax
+	movq %rax, %cr3
+	movq	saved_context+42(%rip), %rax
+	movq %rax, %cr2
+	movq	saved_context+34(%rip), %rax
+	movq %rax, %cr0
+	pushq saved_context_eflags(%rip) ; popfq
+	movq saved_context_esp(%rip), %rsp
+	movq saved_context_ebp(%rip), %rbp
+	movq saved_context_eax(%rip), %rax
+	movq saved_context_ebx(%rip), %rbx
+	movq saved_context_ecx(%rip), %rcx
+	movq saved_context_edx(%rip), %rdx
+	movq saved_context_esi(%rip), %rsi
+	movq saved_context_edi(%rip), %rdi
+	movq saved_context_r08(%rip), %r8
+	movq saved_context_r09(%rip), %r9
+	movq saved_context_r10(%rip), %r10
+	movq saved_context_r11(%rip), %r11
+	movq saved_context_r12(%rip), %r12
+	movq saved_context_r13(%rip), %r13
+	movq saved_context_r14(%rip), %r14
+	movq saved_context_r15(%rip), %r15
+
+	xorl	%eax, %eax
+	addq	$8, %rsp
+	jmp	restore_processor_state
+.LFE5:
+.Lfe5:
+	.size	do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel
+	
+.data
+ALIGN
+ENTRY(saved_ebp)	.quad	0
+ENTRY(saved_esi)	.quad	0
+ENTRY(saved_edi)	.quad	0
+ENTRY(saved_ebx)	.quad	0
+
+ENTRY(saved_eip)	.quad	0
+ENTRY(saved_esp)	.quad	0
+
+ENTRY(saved_magic)	.quad	0
+
+ALIGN
+# saved registers
+saved_gdt:	.quad	0,0
+saved_idt:	.quad	0,0
+saved_ldt:	.quad	0
+saved_tss:	.quad	0
+
+saved_cr0:	.quad 0
+saved_cr3:	.quad 0
+saved_cr4:	.quad 0
+saved_efer:	.quad 0
+saved_efer2:	.quad 0
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
new file mode 100644
index 000000000000..4baa99fe1e5c
--- /dev/null
+++ b/arch/x86_64/kernel/aperture.c
@@ -0,0 +1,286 @@
+/* 
+ * Firmware replacement code.
+ * 
+ * Work around broken BIOSes that don't set an aperture or only set the
+ * aperture in the AGP bridge. 
+ * If all fails map the aperture over some low memory.  This is cheaper than 
+ * doing bounce buffering. The memory is lost. This is done at early boot 
+ * because only the bootmem allocator can allocate 32+MB. 
+ * 
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+#include <asm/pci-direct.h>
+
+int iommu_aperture;
+int iommu_aperture_disabled __initdata = 0;
+int iommu_aperture_allowed __initdata = 0;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata = 0; 
+
+int fix_aperture __initdata = 1;
+
+/* This code runs before the PCI subsystem is initialized, so just 
+   access the northbridge directly. */
+
+#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16))
+
+static u32 __init allocate_aperture(void) 
+{
+#ifdef CONFIG_DISCONTIGMEM
+	pg_data_t *nd0 = NODE_DATA(0);
+#else
+	pg_data_t *nd0 = &contig_page_data;
+#endif	
+	u32 aper_size;
+	void *p; 
+
+	if (fallback_aper_order > 7) 
+		fallback_aper_order = 7; 
+	aper_size = (32 * 1024 * 1024) << fallback_aper_order; 
+
+	/* 
+	 * Aperture has to be naturally aligned. This means an 2GB aperture won't 
+	 * have much chances to find a place in the lower 4GB of memory. 
+	 * Unfortunately we cannot move it up because that would make the 
+	 * IOMMU useless.
+	 */
+	p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); 
+	if (!p || __pa(p)+aper_size > 0xffffffff) {
+		printk("Cannot allocate aperture memory hole (%p,%uK)\n",
+		       p, aper_size>>10);
+		if (p)
+			free_bootmem_node(nd0, (unsigned long)p, aper_size); 
+		return 0;
+	}
+	printk("Mapping aperture over %d KB of RAM @ %lx\n",  
+	       aper_size >> 10, __pa(p)); 
+	return (u32)__pa(p); 
+}
+
+static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) 
+{ 
+	if (!aper_base) 
+		return 0;
+	if (aper_size < 64*1024*1024) { 
+		printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); 
+		return 0;
+	}
+	if (aper_base + aper_size >= 0xffffffff) { 
+		printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
+		return 0; 
+	}
+	if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) {  
+		printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
+		return 0; 
+	} 
+	return 1;
+} 
+
+/* Find a PCI capability */ 
+static __u32 __init find_cap(int num, int slot, int func, int cap) 
+{ 
+	u8 pos;
+	int bytes;
+	if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
+		return 0;
+	pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
+	for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 
+		u8 id;
+		pos &= ~3; 
+		id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
+		if (id == 0xff)
+			break;
+		if (id == cap) 
+			return pos; 
+		pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); 
+	} 
+	return 0;
+} 
+
+/* Read a standard AGPv3 bridge header */
+static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
+{ 
+	u32 apsize;
+	u32 apsizereg;
+	int nbits;
+	u32 aper_low, aper_hi;
+	u64 aper;
+
+	printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
+	apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
+	if (apsizereg == 0xffffffff) {
+		printk("APSIZE in AGP bridge unreadable\n");
+		return 0;
+	}
+
+	apsize = apsizereg & 0xfff;
+	/* Some BIOS use weird encodings not in the AGPv3 table. */
+	if (apsize & 0xff) 
+		apsize |= 0xf00; 
+	nbits = hweight16(apsize);
+	*order = 7 - nbits;
+	if ((int)*order < 0) /* < 32MB */
+		*order = 0;
+	
+	aper_low = read_pci_config(num,slot,func, 0x10);
+	aper_hi = read_pci_config(num,slot,func,0x14);
+	aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+	printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 
+	       aper, 32 << *order, apsizereg);
+
+	if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order))
+	    return 0;
+	return (u32)aper; 
+} 
+
+/* Look for an AGP bridge. Windows only expects the aperture in the
+   AGP bridge and some BIOS forget to initialize the Northbridge too.
+   Work around this here. 
+
+   Do an PCI bus scan by hand because we're running before the PCI
+   subsystem. 
+
+   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+   generically. It's probably overkill to always scan all slots because
+   the AGP bridges should be always an own bus on the HT hierarchy, 
+   but do it here for future safety. */
+static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+{
+	int num, slot, func;
+
+	/* Poor man's PCI discovery */
+	for (num = 0; num < 32; num++) { 
+		for (slot = 0; slot < 32; slot++) { 
+			for (func = 0; func < 8; func++) { 
+				u32 class, cap;
+				u8 type;
+				class = read_pci_config(num,slot,func,
+							PCI_CLASS_REVISION);
+				if (class == 0xffffffff)
+					break; 
+				
+				switch (class >> 16) { 
+				case PCI_CLASS_BRIDGE_HOST:
+				case PCI_CLASS_BRIDGE_OTHER: /* needed? */
+					/* AGP bridge? */
+					cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
+					if (!cap)
+						break;
+					*valid_agp = 1; 
+					return read_agp(num,slot,func,cap,order);
+				} 
+				
+				/* No multi-function device? */
+				type = read_pci_config_byte(num,slot,func,
+							       PCI_HEADER_TYPE);
+				if (!(type & 0x80))
+					break;
+			} 
+		} 
+	}
+	printk("No AGP bridge found\n"); 
+	return 0;
+}
+
+void __init iommu_hole_init(void) 
+{ 
+	int fix, num; 
+	u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0;
+	u64 aper_base, last_aper_base = 0;
+	int valid_agp = 0;
+
+	if (iommu_aperture_disabled || !fix_aperture)
+		return;
+
+	printk("Checking aperture...\n"); 
+
+	fix = 0;
+	for (num = 24; num < 32; num++) {		
+		char name[30];
+		if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) 
+			continue;	
+
+		iommu_aperture = 1; 
+
+		aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 
+		aper_size = (32 * 1024 * 1024) << aper_order; 
+		aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
+		aper_base <<= 25; 
+
+		printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, 
+		       aper_base, aper_size>>20);
+		
+		sprintf(name, "northbridge cpu %d", num-24); 
+
+		if (!aperture_valid(name, aper_base, aper_size)) { 
+			fix = 1; 
+			break; 
+		}
+
+		if ((last_aper_order && aper_order != last_aper_order) ||
+		    (last_aper_base && aper_base != last_aper_base)) {
+			fix = 1;
+			break;
+		}
+		last_aper_order = aper_order;
+		last_aper_base = aper_base;
+	} 
+
+	if (!fix && !fallback_aper_force) 
+		return; 
+
+	if (!fallback_aper_force)
+		aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 
+		
+	if (aper_alloc) { 
+		/* Got the aperture from the AGP bridge */
+	} else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) ||
+		   force_iommu ||
+		   valid_agp ||
+		   fallback_aper_force) { 
+		printk("Your BIOS doesn't leave a aperture memory hole\n");
+		printk("Please enable the IOMMU option in the BIOS setup\n");
+		printk("This costs you %d MB of RAM\n",
+		       32 << fallback_aper_order);
+
+		aper_order = fallback_aper_order;
+		aper_alloc = allocate_aperture();
+		if (!aper_alloc) { 
+			/* Could disable AGP and IOMMU here, but it's probably
+			   not worth it. But the later users cannot deal with
+			   bad apertures and turning on the aperture over memory
+			   causes very strange problems, so it's better to 
+			   panic early. */
+			panic("Not enough memory for aperture");
+		}
+	} else { 
+		return; 
+	} 
+
+	/* Fix up the north bridges */
+	for (num = 24; num < 32; num++) { 		
+		if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) 
+			continue;	
+
+		/* Don't enable translation yet. That is done later. 
+		   Assume this BIOS didn't initialise the GART so 
+		   just overwrite all previous bits */ 
+		write_pci_config(0, num, 3, 0x90, aper_order<<1); 
+		write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 
+	} 
+} 
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
new file mode 100644
index 000000000000..9826f03dc9a3
--- /dev/null
+++ b/arch/x86_64/kernel/apic.c
@@ -0,0 +1,1088 @@
+/*
+ *	Local APIC handling, local APIC timers
+ *
+ *	(c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *	Fixes
+ *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs;
+ *					thanks to Eric Gilmore
+ *					and Rolf G. Tews
+ *					for testing these extensively.
+ *	Maciej W. Rozycki	:	Various updates and fixes.
+ *	Mikael Pettersson	:	Power Management for UP-APIC.
+ *	Pavel Machek and
+ *	Mikael Pettersson	:	PM converted to driver model.
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+
+#include <asm/atomic.h>
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/mach_apic.h>
+
+int apic_verbosity;
+
+int disable_apic_timer __initdata;
+
+/* Using APIC to generate smp_local_timer_interrupt? */
+int using_apic_timer = 0;
+
+static DEFINE_PER_CPU(int, prof_multiplier) = 1;
+static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
+static DEFINE_PER_CPU(int, prof_counter) = 1;
+
+static void apic_pm_activate(void);
+
+void enable_NMI_through_LVT0 (void * dummy)
+{
+	unsigned int v, ver;
+	
+	ver = apic_read(APIC_LVR);
+	ver = GET_APIC_VERSION(ver);
+	v = APIC_DM_NMI;                        /* unmask and set to NMI */
+	apic_write_around(APIC_LVT0, v);
+}
+
+int get_maxlvt(void)
+{
+	unsigned int v, ver, maxlvt;
+
+	v = apic_read(APIC_LVR);
+	ver = GET_APIC_VERSION(v);
+	maxlvt = GET_APIC_MAXLVT(v);
+	return maxlvt;
+}
+
+void clear_local_APIC(void)
+{
+	int maxlvt;
+	unsigned int v;
+
+	maxlvt = get_maxlvt();
+
+	/*
+	 * Masking an LVT entry on a P6 can trigger a local APIC error
+	 * if the vector is zero. Mask LVTERR first to prevent this.
+	 */
+	if (maxlvt >= 3) {
+		v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
+		apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
+	}
+	/*
+	 * Careful: we have to set masks only first to deassert
+	 * any level-triggered sources.
+	 */
+	v = apic_read(APIC_LVTT);
+	apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+	v = apic_read(APIC_LVT0);
+	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+	v = apic_read(APIC_LVT1);
+	apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
+	if (maxlvt >= 4) {
+		v = apic_read(APIC_LVTPC);
+		apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
+	}
+
+	/*
+	 * Clean APIC state for other OSs:
+	 */
+	apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
+	apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+	apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
+	if (maxlvt >= 3)
+		apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
+	if (maxlvt >= 4)
+		apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
+	v = GET_APIC_VERSION(apic_read(APIC_LVR));
+	if (APIC_INTEGRATED(v)) {	/* !82489DX */
+		if (maxlvt > 3)		/* Due to Pentium errata 3AP and 11AP. */
+			apic_write(APIC_ESR, 0);
+		apic_read(APIC_ESR);
+	}
+}
+
+void __init connect_bsp_APIC(void)
+{
+	if (pic_mode) {
+		/*
+		 * Do not trust the local APIC being empty at bootup.
+		 */
+		clear_local_APIC();
+		/*
+		 * PIC mode, enable APIC mode in the IMCR, i.e.
+		 * connect BSP's local APIC to INT and NMI lines.
+		 */
+		apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n");
+		outb(0x70, 0x22);
+		outb(0x01, 0x23);
+	}
+}
+
+void disconnect_bsp_APIC(void)
+{
+	if (pic_mode) {
+		/*
+		 * Put the board back into PIC mode (has an effect
+		 * only on certain older boards).  Note that APIC
+		 * interrupts, including IPIs, won't work beyond
+		 * this point!  The only exception are INIT IPIs.
+		 */
+		apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n");
+		outb(0x70, 0x22);
+		outb(0x00, 0x23);
+	}
+}
+
+void disable_local_APIC(void)
+{
+	unsigned int value;
+
+	clear_local_APIC();
+
+	/*
+	 * Disable APIC (implies clearing of registers
+	 * for 82489DX!).
+	 */
+	value = apic_read(APIC_SPIV);
+	value &= ~APIC_SPIV_APIC_ENABLED;
+	apic_write_around(APIC_SPIV, value);
+}
+
+/*
+ * This is to verify that we're looking at a real local APIC.
+ * Check these against your board if the CPUs aren't getting
+ * started for no apparent reason.
+ */
+int __init verify_local_APIC(void)
+{
+	unsigned int reg0, reg1;
+
+	/*
+	 * The version register is read-only in a real APIC.
+	 */
+	reg0 = apic_read(APIC_LVR);
+	apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
+	apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
+	reg1 = apic_read(APIC_LVR);
+	apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
+
+	/*
+	 * The two version reads above should print the same
+	 * numbers.  If the second one is different, then we
+	 * poke at a non-APIC.
+	 */
+	if (reg1 != reg0)
+		return 0;
+
+	/*
+	 * Check if the version looks reasonably.
+	 */
+	reg1 = GET_APIC_VERSION(reg0);
+	if (reg1 == 0x00 || reg1 == 0xff)
+		return 0;
+	reg1 = get_maxlvt();
+	if (reg1 < 0x02 || reg1 == 0xff)
+		return 0;
+
+	/*
+	 * The ID register is read/write in a real APIC.
+	 */
+	reg0 = apic_read(APIC_ID);
+	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+	apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
+	reg1 = apic_read(APIC_ID);
+	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
+	apic_write(APIC_ID, reg0);
+	if (reg1 != (reg0 ^ APIC_ID_MASK))
+		return 0;
+
+	/*
+	 * The next two are just to see if we have sane values.
+	 * They're only really relevant if we're in Virtual Wire
+	 * compatibility mode, but most boxes are anymore.
+	 */
+	reg0 = apic_read(APIC_LVT0);
+	apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
+	reg1 = apic_read(APIC_LVT1);
+	apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
+
+	return 1;
+}
+
+void __init sync_Arb_IDs(void)
+{
+	/* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
+	unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+	if (ver >= 0x14)	/* P4 or higher */
+		return;
+
+	/*
+	 * Wait for idle.
+	 */
+	apic_wait_icr_idle();
+
+	apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
+	apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
+				| APIC_DM_INIT);
+}
+
+extern void __error_in_apic_c (void);
+
+/*
+ * An initial setup of the virtual wire mode.
+ */
+void __init init_bsp_APIC(void)
+{
+	unsigned int value, ver;
+
+	/*
+	 * Don't do the setup now if we have a SMP BIOS as the
+	 * through-I/O-APIC virtual wire mode might be active.
+	 */
+	if (smp_found_config || !cpu_has_apic)
+		return;
+
+	value = apic_read(APIC_LVR);
+	ver = GET_APIC_VERSION(value);
+
+	/*
+	 * Do not trust the local APIC being empty at bootup.
+	 */
+	clear_local_APIC();
+
+	/*
+	 * Enable APIC.
+	 */
+	value = apic_read(APIC_SPIV);
+	value &= ~APIC_VECTOR_MASK;
+	value |= APIC_SPIV_APIC_ENABLED;
+	value |= APIC_SPIV_FOCUS_DISABLED;
+	value |= SPURIOUS_APIC_VECTOR;
+	apic_write_around(APIC_SPIV, value);
+
+	/*
+	 * Set up the virtual wire mode.
+	 */
+	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+	value = APIC_DM_NMI;
+	if (!APIC_INTEGRATED(ver))		/* 82489DX */
+		value |= APIC_LVT_LEVEL_TRIGGER;
+	apic_write_around(APIC_LVT1, value);
+}
+
+void __init setup_local_APIC (void)
+{
+	unsigned int value, ver, maxlvt;
+
+	/* Pound the ESR really hard over the head with a big hammer - mbligh */
+	if (esr_disable) {
+		apic_write(APIC_ESR, 0);
+		apic_write(APIC_ESR, 0);
+		apic_write(APIC_ESR, 0);
+		apic_write(APIC_ESR, 0);
+	}
+
+	value = apic_read(APIC_LVR);
+	ver = GET_APIC_VERSION(value);
+
+	if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
+		__error_in_apic_c();
+
+	/*
+	 * Double-check whether this APIC is really registered.
+	 * This is meaningless in clustered apic mode, so we skip it.
+	 */
+	if (!apic_id_registered())
+		BUG();
+
+	/*
+	 * Intel recommends to set DFR, LDR and TPR before enabling
+	 * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
+	 * document number 292116).  So here it goes...
+	 */
+	init_apic_ldr();
+
+	/*
+	 * Set Task Priority to 'accept all'. We never change this
+	 * later on.
+	 */
+	value = apic_read(APIC_TASKPRI);
+	value &= ~APIC_TPRI_MASK;
+	apic_write_around(APIC_TASKPRI, value);
+
+	/*
+	 * Now that we are all set up, enable the APIC
+	 */
+	value = apic_read(APIC_SPIV);
+	value &= ~APIC_VECTOR_MASK;
+	/*
+	 * Enable APIC
+	 */
+	value |= APIC_SPIV_APIC_ENABLED;
+
+	/*
+	 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
+	 * certain networking cards. If high frequency interrupts are
+	 * happening on a particular IOAPIC pin, plus the IOAPIC routing
+	 * entry is masked/unmasked at a high rate as well then sooner or
+	 * later IOAPIC line gets 'stuck', no more interrupts are received
+	 * from the device. If focus CPU is disabled then the hang goes
+	 * away, oh well :-(
+	 *
+	 * [ This bug can be reproduced easily with a level-triggered
+	 *   PCI Ne2000 networking cards and PII/PIII processors, dual
+	 *   BX chipset. ]
+	 */
+	/*
+	 * Actually disabling the focus CPU check just makes the hang less
+	 * frequent as it makes the interrupt distributon model be more
+	 * like LRU than MRU (the short-term load is more even across CPUs).
+	 * See also the comment in end_level_ioapic_irq().  --macro
+	 */
+#if 1
+	/* Enable focus processor (bit==0) */
+	value &= ~APIC_SPIV_FOCUS_DISABLED;
+#else
+	/* Disable focus processor (bit==1) */
+	value |= APIC_SPIV_FOCUS_DISABLED;
+#endif
+	/*
+	 * Set spurious IRQ vector
+	 */
+	value |= SPURIOUS_APIC_VECTOR;
+	apic_write_around(APIC_SPIV, value);
+
+	/*
+	 * Set up LVT0, LVT1:
+	 *
+	 * set up through-local-APIC on the BP's LINT0. This is not
+	 * strictly necessary in pure symmetric-IO mode, but sometimes
+	 * we delegate interrupts to the 8259A.
+	 */
+	/*
+	 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
+	 */
+	value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
+	if (!smp_processor_id() && (pic_mode || !value)) {
+		value = APIC_DM_EXTINT;
+		apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
+	} else {
+		value = APIC_DM_EXTINT | APIC_LVT_MASKED;
+		apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
+	}
+	apic_write_around(APIC_LVT0, value);
+
+	/*
+	 * only the BP should see the LINT1 NMI signal, obviously.
+	 */
+	if (!smp_processor_id())
+		value = APIC_DM_NMI;
+	else
+		value = APIC_DM_NMI | APIC_LVT_MASKED;
+	if (!APIC_INTEGRATED(ver))		/* 82489DX */
+		value |= APIC_LVT_LEVEL_TRIGGER;
+	apic_write_around(APIC_LVT1, value);
+
+	if (APIC_INTEGRATED(ver) && !esr_disable) {		/* !82489DX */
+		unsigned oldvalue;
+		maxlvt = get_maxlvt();
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
+			apic_write(APIC_ESR, 0);
+		oldvalue = apic_read(APIC_ESR);
+		value = ERROR_APIC_VECTOR;      // enables sending errors
+		apic_write_around(APIC_LVTERR, value);
+		/*
+		 * spec says clear errors after enabling vector.
+		 */
+		if (maxlvt > 3)
+			apic_write(APIC_ESR, 0);
+		value = apic_read(APIC_ESR);
+		if (value != oldvalue)
+			apic_printk(APIC_VERBOSE,
+			"ESR value after enabling vector: %08x, after %08x\n",
+			oldvalue, value);
+	} else {
+		if (esr_disable)	
+			/* 
+			 * Something untraceble is creating bad interrupts on 
+			 * secondary quads ... for the moment, just leave the
+			 * ESR disabled - we can't do anything useful with the
+			 * errors anyway - mbligh
+			 */
+			apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n");
+		else 
+			apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n");
+	}
+
+	nmi_watchdog_default();
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		setup_apic_nmi_watchdog();
+	apic_pm_activate();
+}
+
+#ifdef CONFIG_PM
+
+static struct {
+	/* 'active' is true if the local APIC was enabled by us and
+	   not the BIOS; this signifies that we are also responsible
+	   for disabling it before entering apm/acpi suspend */
+	int active;
+	/* r/w apic fields */
+	unsigned int apic_id;
+	unsigned int apic_taskpri;
+	unsigned int apic_ldr;
+	unsigned int apic_dfr;
+	unsigned int apic_spiv;
+	unsigned int apic_lvtt;
+	unsigned int apic_lvtpc;
+	unsigned int apic_lvt0;
+	unsigned int apic_lvt1;
+	unsigned int apic_lvterr;
+	unsigned int apic_tmict;
+	unsigned int apic_tdcr;
+	unsigned int apic_thmr;
+} apic_pm_state;
+
+static int lapic_suspend(struct sys_device *dev, u32 state)
+{
+	unsigned long flags;
+
+	if (!apic_pm_state.active)
+		return 0;
+
+	apic_pm_state.apic_id = apic_read(APIC_ID);
+	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+	apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+	apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+	apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+	apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+	apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+	apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+	local_save_flags(flags);
+	local_irq_disable();
+	disable_local_APIC();
+	local_irq_restore(flags);
+	return 0;
+}
+
+static int lapic_resume(struct sys_device *dev)
+{
+	unsigned int l, h;
+	unsigned long flags;
+
+	if (!apic_pm_state.active)
+		return 0;
+
+	/* XXX: Pavel needs this for S3 resume, but can't explain why */
+	set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
+
+	local_irq_save(flags);
+	rdmsr(MSR_IA32_APICBASE, l, h);
+	l &= ~MSR_IA32_APICBASE_BASE;
+	l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+	wrmsr(MSR_IA32_APICBASE, l, h);
+	apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+	apic_write(APIC_ID, apic_pm_state.apic_id);
+	apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+	apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+	apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+	apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+	apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+	apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+	apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+	apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+	apic_write(APIC_ESR, 0);
+	apic_read(APIC_ESR);
+	apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+	apic_write(APIC_ESR, 0);
+	apic_read(APIC_ESR);
+	local_irq_restore(flags);
+	return 0;
+}
+
+static struct sysdev_class lapic_sysclass = {
+	set_kset_name("lapic"),
+	.resume		= lapic_resume,
+	.suspend	= lapic_suspend,
+};
+
+static struct sys_device device_lapic = {
+	.id		= 0,
+	.cls		= &lapic_sysclass,
+};
+
+static void __init apic_pm_activate(void)
+{
+	apic_pm_state.active = 1;
+}
+
+static int __init init_lapic_sysfs(void)
+{
+	int error;
+	if (!cpu_has_apic)
+		return 0;
+	/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+	error = sysdev_class_register(&lapic_sysclass);
+	if (!error)
+		error = sysdev_register(&device_lapic);
+	return error;
+}
+device_initcall(init_lapic_sysfs);
+
+#else	/* CONFIG_PM */
+
+static void apic_pm_activate(void) { }
+
+#endif	/* CONFIG_PM */
+
+static int __init apic_set_verbosity(char *str)
+{
+	if (strcmp("debug", str) == 0)
+		apic_verbosity = APIC_DEBUG;
+	else if (strcmp("verbose", str) == 0)
+		apic_verbosity = APIC_VERBOSE;
+	else
+		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+				" use apic=verbose or apic=debug", str);
+
+	return 0;
+}
+
+__setup("apic=", apic_set_verbosity);
+
+/*
+ * Detect and enable local APICs on non-SMP boards.
+ * Original code written by Keir Fraser.
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely
+ * not correctly set up (usually the APIC timer won't work etc.) 
+ */
+
+static int __init detect_init_APIC (void)
+{
+	if (!cpu_has_apic) {
+		printk(KERN_INFO "No local APIC present\n");
+		return -1;
+	}
+
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+	boot_cpu_id = 0;
+	return 0;
+}
+
+void __init init_apic_mappings(void)
+{
+	unsigned long apic_phys;
+
+	/*
+	 * If no local APIC can be found then set up a fake all
+	 * zeroes page to simulate the local APIC and another
+	 * one for the IO-APIC.
+	 */
+	if (!smp_found_config && detect_init_APIC()) {
+		apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
+		apic_phys = __pa(apic_phys);
+	} else
+		apic_phys = mp_lapic_addr;
+
+	set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+	apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
+
+	/*
+	 * Fetch the APIC ID of the BSP in case we have a
+	 * default configuration (or the MP table is broken).
+	 */
+	if (boot_cpu_id == -1U)
+		boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
+
+#ifdef CONFIG_X86_IO_APIC
+	{
+		unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+		int i;
+
+		for (i = 0; i < nr_ioapics; i++) {
+			if (smp_found_config) {
+				ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+			} else {
+				ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
+				ioapic_phys = __pa(ioapic_phys);
+			}
+			set_fixmap_nocache(idx, ioapic_phys);
+			apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
+					__fix_to_virt(idx), ioapic_phys);
+			idx++;
+		}
+	}
+#endif
+}
+
+/*
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
+ */
+
+#define APIC_DIVISOR 16
+
+static void __setup_APIC_LVTT(unsigned int clocks)
+{
+	unsigned int lvtt_value, tmp_value, ver;
+
+	ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+	lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
+	if (!APIC_INTEGRATED(ver))
+		lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+	apic_write_around(APIC_LVTT, lvtt_value);
+
+	/*
+	 * Divide PICLK by 16
+	 */
+	tmp_value = apic_read(APIC_TDCR);
+	apic_write_around(APIC_TDCR, (tmp_value
+				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+				| APIC_TDR_DIV_16);
+
+	apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+}
+
+static void setup_APIC_timer(unsigned int clocks)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	/* For some reasons this doesn't work on Simics, so fake it for now */ 
+	if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { 
+	__setup_APIC_LVTT(clocks);
+		return;
+	} 
+
+	/* wait for irq slice */
+ 	if (vxtime.hpet_address) {
+ 		int trigger = hpet_readl(HPET_T0_CMP);
+ 		while (hpet_readl(HPET_COUNTER) >= trigger)
+ 			/* do nothing */ ;
+ 		while (hpet_readl(HPET_COUNTER) <  trigger)
+ 			/* do nothing */ ;
+ 	} else {
+		int c1, c2;
+		outb_p(0x00, 0x43);
+		c2 = inb_p(0x40);
+		c2 |= inb_p(0x40) << 8;
+	do {
+			c1 = c2;
+			outb_p(0x00, 0x43);
+			c2 = inb_p(0x40);
+			c2 |= inb_p(0x40) << 8;
+		} while (c2 - c1 < 300);
+	}
+
+	__setup_APIC_LVTT(clocks);
+
+	local_irq_restore(flags);
+}
+
+/*
+ * In this function we calibrate APIC bus clocks to the external
+ * timer. Unfortunately we cannot use jiffies and the timer irq
+ * to calibrate, since some later bootup code depends on getting
+ * the first irq? Ugh.
+ *
+ * We want to do the calibration only once since we
+ * want to have local timer irqs syncron. CPUs connected
+ * by the same APIC bus have the very same bus frequency.
+ * And we want to have irqs off anyways, no accidental
+ * APIC irq that way.
+ */
+
+#define TICK_COUNT 100000000
+
+static int __init calibrate_APIC_clock(void)
+{
+	int apic, apic_start, tsc, tsc_start;
+	int result;
+	/*
+	 * Put whatever arbitrary (but long enough) timeout
+	 * value into the APIC clock, we just want to get the
+	 * counter running for calibration.
+	 */
+	__setup_APIC_LVTT(1000000000);
+
+	apic_start = apic_read(APIC_TMCCT);
+	rdtscl(tsc_start);
+
+	do {
+		apic = apic_read(APIC_TMCCT);
+		rdtscl(tsc);
+	} while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT);
+
+	result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start);
+
+	printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
+		result / 1000 / 1000, result / 1000 % 1000);
+
+	return result * APIC_DIVISOR / HZ;
+}
+
+static unsigned int calibration_result;
+
+void __init setup_boot_APIC_clock (void)
+{
+	if (disable_apic_timer) { 
+		printk(KERN_INFO "Disabling APIC timer\n"); 
+		return; 
+	} 
+
+	printk(KERN_INFO "Using local APIC timer interrupts.\n");
+	using_apic_timer = 1;
+
+	local_irq_disable();
+
+	calibration_result = calibrate_APIC_clock();
+	/*
+	 * Now set up the timer for real.
+	 */
+	setup_APIC_timer(calibration_result);
+
+	local_irq_enable();
+}
+
+void __init setup_secondary_APIC_clock(void)
+{
+	local_irq_disable(); /* FIXME: Do we need this? --RR */
+	setup_APIC_timer(calibration_result);
+	local_irq_enable();
+}
+
+void __init disable_APIC_timer(void)
+{
+	if (using_apic_timer) {
+		unsigned long v;
+
+		v = apic_read(APIC_LVTT);
+		apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+	}
+}
+
+void enable_APIC_timer(void)
+{
+	if (using_apic_timer) {
+		unsigned long v;
+
+		v = apic_read(APIC_LVTT);
+		apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
+	}
+}
+
+/*
+ * the frequency of the profiling timer can be changed
+ * by writing a multiplier value into /proc/profile.
+ */
+int setup_profiling_timer(unsigned int multiplier)
+{
+	int i;
+
+	/*
+	 * Sanity check. [at least 500 APIC cycles should be
+	 * between APIC interrupts as a rule of thumb, to avoid
+	 * irqs flooding us]
+	 */
+	if ( (!multiplier) || (calibration_result/multiplier < 500))
+		return -EINVAL;
+
+	/* 
+	 * Set the new multiplier for each CPU. CPUs don't start using the
+	 * new values until the next timer interrupt in which they do process
+	 * accounting. At that time they also adjust their APIC timers
+	 * accordingly.
+	 */
+	for (i = 0; i < NR_CPUS; ++i)
+		per_cpu(prof_multiplier, i) = multiplier;
+
+	return 0;
+}
+
+#undef APIC_DIVISOR
+
+/*
+ * Local timer interrupt handler. It does both profiling and
+ * process statistics/rescheduling.
+ *
+ * We do profiling in every local tick, statistics/rescheduling
+ * happen only every 'profiling multiplier' ticks. The default
+ * multiplier is 1 and it can be changed by writing the new multiplier
+ * value into /proc/profile.
+ */
+
+void smp_local_timer_interrupt(struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+
+	profile_tick(CPU_PROFILING, regs);
+	if (--per_cpu(prof_counter, cpu) <= 0) {
+		/*
+		 * The multiplier may have changed since the last time we got
+		 * to this point as a result of the user writing to
+		 * /proc/profile. In this case we need to adjust the APIC
+		 * timer accordingly.
+		 *
+		 * Interrupts are already masked off at this point.
+		 */
+		per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
+		if (per_cpu(prof_counter, cpu) != 
+		    per_cpu(prof_old_multiplier, cpu)) {
+			__setup_APIC_LVTT(calibration_result/
+					per_cpu(prof_counter, cpu));
+			per_cpu(prof_old_multiplier, cpu) =
+				per_cpu(prof_counter, cpu);
+		}
+
+#ifdef CONFIG_SMP
+		update_process_times(user_mode(regs));
+#endif
+	}
+
+	/*
+	 * We take the 'long' return path, and there every subsystem
+	 * grabs the appropriate locks (kernel lock/ irq lock).
+	 *
+	 * we might want to decouple profiling from the 'long path',
+	 * and do the profiling totally in assembly.
+	 *
+	 * Currently this isn't too much of an issue (performance wise),
+	 * we can take more than 100K local irqs per second on a 100 MHz P5.
+	 */
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ *   interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+void smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+	/*
+	 * the NMI deadlock-detector uses this.
+	 */
+	add_pda(apic_timer_irqs, 1);
+
+	/*
+	 * NOTE! We'd better ACK the irq immediately,
+	 * because timer handling can be slow.
+	 */
+	ack_APIC_irq();
+	/*
+	 * update_process_times() expects us to have done irq_enter().
+	 * Besides, if we don't timer interrupts ignore the global
+	 * interrupt lock, which is the WrongThing (tm) to do.
+	 */
+	irq_enter();
+	smp_local_timer_interrupt(regs);
+	irq_exit();
+}
+
+/*
+ * oem_force_hpet_timer -- force HPET mode for some boxes.
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ *
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis. Use available data to take a good guess.
+ * If in doubt, go HPET.
+ */
+__init int oem_force_hpet_timer(void)
+{
+	int i, clusters, zeros;
+	unsigned id;
+	DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
+
+	bitmap_empty(clustermap, NUM_APIC_CLUSTERS);
+
+	for (i = 0; i < NR_CPUS; i++) {
+		id = bios_cpu_apicid[i];
+		if (id != BAD_APICID)
+			__set_bit(APIC_CLUSTERID(id), clustermap);
+	}
+
+	/* Problem:  Partially populated chassis may not have CPUs in some of
+	 * the APIC clusters they have been allocated.  Only present CPUs have
+	 * bios_cpu_apicid entries, thus causing zeroes in the bitmap.  Since
+	 * clusters are allocated sequentially, count zeros only if they are
+	 * bounded by ones.
+	 */
+	clusters = 0;
+	zeros = 0;
+	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+		if (test_bit(i, clustermap)) {
+			clusters += 1 + zeros;
+			zeros = 0;
+		} else
+			++zeros;
+	}
+
+	/*
+	 * If clusters > 2, then should be multi-chassis.  Return 1 for HPET.
+	 * Else return 0 to use TSC.
+	 * May have to revisit this when multi-core + hyperthreaded CPUs come
+	 * out, but AFAIK this will work even for them.
+	 */
+	return (clusters > 2);
+}
+
+/*
+ * This interrupt should _never_ happen with our APIC/SMP architecture
+ */
+asmlinkage void smp_spurious_interrupt(void)
+{
+	unsigned int v;
+	irq_enter();
+	/*
+	 * Check if this really is a spurious interrupt and ACK it
+	 * if it is a vectored one.  Just in case...
+	 * Spurious interrupts should not be ACKed.
+	 */
+	v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
+	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
+		ack_APIC_irq();
+
+#if 0
+	static unsigned long last_warning; 
+	static unsigned long skipped; 
+
+	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
+	if (time_before(last_warning+30*HZ,jiffies)) { 
+		printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
+		       smp_processor_id(), skipped);
+		last_warning = jiffies; 
+		skipped = 0;
+	} else { 
+		skipped++; 
+	} 
+#endif 
+	irq_exit();
+}
+
+/*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+
+asmlinkage void smp_error_interrupt(void)
+{
+	unsigned int v, v1;
+
+	irq_enter();
+	/* First tickle the hardware, only then report what went on. -- REW */
+	v = apic_read(APIC_ESR);
+	apic_write(APIC_ESR, 0);
+	v1 = apic_read(APIC_ESR);
+	ack_APIC_irq();
+	atomic_inc(&irq_err_count);
+
+	/* Here is what the APIC error bits mean:
+	   0: Send CS error
+	   1: Receive CS error
+	   2: Send accept error
+	   3: Receive accept error
+	   4: Reserved
+	   5: Send illegal vector
+	   6: Received illegal vector
+	   7: Illegal register address
+	*/
+	printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
+	        smp_processor_id(), v , v1);
+	irq_exit();
+}
+
+int disable_apic; 
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor (void)
+{
+	if (disable_apic) { 
+		printk(KERN_INFO "Apic disabled\n");
+		return -1; 
+	}
+	if (!cpu_has_apic) { 
+		disable_apic = 1;
+		printk(KERN_INFO "Apic disabled by BIOS\n");
+		return -1;
+	}
+
+	verify_local_APIC();
+
+	connect_bsp_APIC();
+
+	phys_cpu_present_map = physid_mask_of_physid(0);
+	apic_write_around(APIC_ID, boot_cpu_id);
+
+	setup_local_APIC();
+
+#ifdef CONFIG_X86_IO_APIC
+	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+			setup_IO_APIC();
+	else
+		nr_ioapics = 0;
+#endif
+	setup_boot_APIC_clock();
+
+	return 0;
+}
+
+static __init int setup_disableapic(char *str) 
+{ 
+	disable_apic = 1;
+	return 0;
+} 
+
+static __init int setup_nolapic(char *str) 
+{ 
+	disable_apic = 1;
+	return 0;
+} 
+
+static __init int setup_noapictimer(char *str) 
+{ 
+	disable_apic_timer = 1;
+	return 0;
+} 
+
+/* dummy parsing: see setup.c */
+
+__setup("disableapic", setup_disableapic); 
+__setup("nolapic", setup_nolapic);  /* same as disableapic, for compatibility */
+
+__setup("noapictimer", setup_noapictimer); 
+
+/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
new file mode 100644
index 000000000000..35b4c3fcbb37
--- /dev/null
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -0,0 +1,69 @@
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+
+#include <linux/sched.h> 
+#include <linux/stddef.h>
+#include <linux/errno.h> 
+#include <linux/hardirq.h>
+#include <linux/suspend.h>
+#include <asm/pda.h>
+#include <asm/processor.h>
+#include <asm/segment.h>
+#include <asm/thread_info.h>
+#include <asm/ia32.h>
+
+#define DEFINE(sym, val) \
+        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+
+#define BLANK() asm volatile("\n->" : : )
+
+int main(void)
+{
+#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
+	ENTRY(state);
+	ENTRY(flags); 
+	ENTRY(thread); 
+	ENTRY(pid);
+	BLANK();
+#undef ENTRY
+#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
+	ENTRY(flags);
+	ENTRY(addr_limit);
+	ENTRY(preempt_count);
+	BLANK();
+#undef ENTRY
+#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
+	ENTRY(kernelstack); 
+	ENTRY(oldrsp); 
+	ENTRY(pcurrent); 
+	ENTRY(irqrsp);
+	ENTRY(irqcount);
+	ENTRY(cpunumber);
+	ENTRY(irqstackptr);
+	BLANK();
+#undef ENTRY
+#ifdef CONFIG_IA32_EMULATION
+#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
+	ENTRY(eax);
+	ENTRY(ebx);
+	ENTRY(ecx);
+	ENTRY(edx);
+	ENTRY(esi);
+	ENTRY(edi);
+	ENTRY(ebp);
+	ENTRY(esp);
+	ENTRY(eip);
+	BLANK();
+#undef ENTRY
+	DEFINE(IA32_RT_SIGFRAME_sigcontext,
+	       offsetof (struct rt_sigframe32, uc.uc_mcontext));
+	BLANK();
+#endif
+	DEFINE(pbe_address, offsetof(struct pbe, address));
+	DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
+	DEFINE(pbe_next, offsetof(struct pbe, next));
+	return 0;
+}
diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig
new file mode 100644
index 000000000000..81f1562e5393
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Kconfig
@@ -0,0 +1,96 @@
+#
+# CPU Frequency scaling
+#
+
+menu "CPU Frequency scaling"
+
+source "drivers/cpufreq/Kconfig"
+
+if CPU_FREQ
+
+comment "CPUFreq processor drivers"
+
+config X86_POWERNOW_K8
+	tristate "AMD Opteron/Athlon64 PowerNow!"
+	select CPU_FREQ_TABLE
+	help
+	  This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
+
+	  For details, take a look at <file:Documentation/cpu-freq/>. 
+
+	  If in doubt, say N.
+
+config X86_POWERNOW_K8_ACPI
+	bool
+	depends on X86_POWERNOW_K8 && ACPI_PROCESSOR
+	depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
+	default y
+
+config X86_SPEEDSTEP_CENTRINO
+	tristate "Intel Enhanced SpeedStep"
+	select CPU_FREQ_TABLE
+	depends on ACPI_PROCESSOR
+	help
+	  This adds the CPUFreq driver for Enhanced SpeedStep enabled
+	  mobile CPUs.  This means Intel Pentium M (Centrino) CPUs
+	  or 64bit enabled Intel Xeons.
+
+	  For details, take a look at <file:Documentation/cpu-freq/>.
+
+	  If in doubt, say N.
+
+config X86_SPEEDSTEP_CENTRINO_ACPI
+	bool
+	depends on X86_SPEEDSTEP_CENTRINO
+	default y
+
+config X86_ACPI_CPUFREQ
+	tristate "ACPI Processor P-States driver"
+	depends on ACPI_PROCESSOR
+	help
+	  This driver adds a CPUFreq driver which utilizes the ACPI
+	  Processor Performance States.
+
+	  For details, take a look at <file:Documentation/cpu-freq/>.
+
+	  If in doubt, say N.
+
+comment "shared options"
+
+config X86_ACPI_CPUFREQ_PROC_INTF
+        bool "/proc/acpi/processor/../performance interface (deprecated)"
+	depends on PROC_FS
+	depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI
+	help
+	  This enables the deprecated /proc/acpi/processor/../performance
+	  interface. While it is helpful for debugging, the generic,
+	  cross-architecture cpufreq interfaces should be used.
+
+	  If in doubt, say N.
+
+config X86_P4_CLOCKMOD
+	tristate "Intel Pentium 4 clock modulation"
+	depends on EMBEDDED
+	help
+	  This adds the clock modulation driver for Intel Pentium 4 / XEON
+	  processors.  When enabled it will lower CPU temperature by skipping
+	  clocks.
+
+	  This driver should be only used in exceptional
+	  circumstances when very low power is needed because it causes severe
+	  slowdowns and noticeable latencies.  Normally Speedstep should be used
+	  instead.
+
+	  For details, take a look at <file:Documentation/cpu-freq/>.
+
+	  Unless you are absolutely sure say N.
+
+
+config X86_SPEEDSTEP_LIB
+        tristate
+        default X86_P4_CLOCKMOD
+
+endif
+
+endmenu
+
diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile
new file mode 100644
index 000000000000..d8b593879224
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Makefile
@@ -0,0 +1,17 @@
+#
+# Reuse the i386 cpufreq drivers
+#
+
+SRCDIR := ../../../i386/kernel/cpu/cpufreq
+
+obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
+obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
+obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
+obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
+
+powernow-k8-objs := ${SRCDIR}/powernow-k8.o
+speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o
+acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o
+p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o
+speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
new file mode 100644
index 000000000000..657003e461e6
--- /dev/null
+++ b/arch/x86_64/kernel/e820.c
@@ -0,0 +1,513 @@
+/* 
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/bootsetup.h>
+
+extern char _end[];
+
+/* 
+ * PFN of last memory page.
+ */
+unsigned long end_pfn; 
+
+/* 
+ * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
+ * The direct mapping extends to end_pfn_map, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */ 
+unsigned long end_pfn_map; 
+
+/* 
+ * Last pfn which the user wants to use.
+ */
+unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;  
+
+extern struct resource code_resource, data_resource;
+
+/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 
+static inline int bad_addr(unsigned long *addrp, unsigned long size)
+{ 
+	unsigned long addr = *addrp, last = addr + size; 
+
+	/* various gunk below that needed for SMP startup */
+	if (addr < 0x8000) { 
+		*addrp = 0x8000;
+		return 1; 
+	}
+
+	/* direct mapping tables of the kernel */
+	if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { 
+		*addrp = table_end << PAGE_SHIFT; 
+		return 1;
+	} 
+
+	/* initrd */ 
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (LOADER_TYPE && INITRD_START && last >= INITRD_START && 
+	    addr < INITRD_START+INITRD_SIZE) { 
+		*addrp = INITRD_START + INITRD_SIZE; 
+		return 1;
+	} 
+#endif
+	/* kernel code + 640k memory hole (later should not be needed, but 
+	   be paranoid for now) */
+	if (last >= 640*1024 && addr < __pa_symbol(&_end)) { 
+		*addrp = __pa_symbol(&_end);
+		return 1;
+	}
+	/* XXX ramdisk image here? */ 
+	return 0;
+} 
+
+int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) 
+{ 
+	int i;
+	for (i = 0; i < e820.nr_map; i++) { 
+		struct e820entry *ei = &e820.map[i]; 
+		if (type && ei->type != type) 
+			continue;
+		if (ei->addr >= end || ei->addr + ei->size < start) 
+			continue; 
+		return 1; 
+	} 
+	return 0;
+}
+
+/* 
+ * Find a free area in a specific range. 
+ */ 
+unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 
+{ 
+	int i; 
+	for (i = 0; i < e820.nr_map; i++) { 
+		struct e820entry *ei = &e820.map[i]; 
+		unsigned long addr = ei->addr, last; 
+		if (ei->type != E820_RAM) 
+			continue; 
+		if (addr < start) 
+			addr = start;
+		if (addr > ei->addr + ei->size) 
+			continue; 
+		while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
+			;
+		last = addr + size;
+		if (last > ei->addr + ei->size)
+			continue;
+		if (last > end) 
+			continue;
+		return addr; 
+	} 
+	return -1UL;		
+} 
+
+/* 
+ * Free bootmem based on the e820 table for a node.
+ */
+void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
+{
+	int i;
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i]; 
+		unsigned long last, addr;
+
+		if (ei->type != E820_RAM || 
+		    ei->addr+ei->size <= start || 
+		    ei->addr > end)
+			continue;
+
+		addr = round_up(ei->addr, PAGE_SIZE);
+		if (addr < start) 
+			addr = start;
+
+		last = round_down(ei->addr + ei->size, PAGE_SIZE); 
+		if (last >= end)
+			last = end; 
+
+		if (last > addr && last-addr >= PAGE_SIZE)
+			free_bootmem_node(pgdat, addr, last-addr);
+	}
+}
+
+/*
+ * Find the highest page frame number we have available
+ */
+unsigned long __init e820_end_of_ram(void)
+{
+	int i;
+	unsigned long end_pfn = 0;
+	
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i]; 
+		unsigned long start, end;
+
+		start = round_up(ei->addr, PAGE_SIZE); 
+		end = round_down(ei->addr + ei->size, PAGE_SIZE); 
+		if (start >= end)
+			continue;
+		if (ei->type == E820_RAM) { 
+		if (end > end_pfn<<PAGE_SHIFT)
+			end_pfn = end>>PAGE_SHIFT;
+		} else { 
+			if (end > end_pfn_map<<PAGE_SHIFT) 
+				end_pfn_map = end>>PAGE_SHIFT;
+		} 
+	}
+
+	if (end_pfn > end_pfn_map) 
+		end_pfn_map = end_pfn;
+	if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
+		end_pfn_map = MAXMEM>>PAGE_SHIFT;
+	if (end_pfn > end_user_pfn)
+		end_pfn = end_user_pfn;
+	if (end_pfn > end_pfn_map) 
+		end_pfn = end_pfn_map; 
+
+	return end_pfn;	
+}
+
+/* 
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+void __init e820_reserve_resources(void)
+{
+	int i;
+	for (i = 0; i < e820.nr_map; i++) {
+		struct resource *res;
+		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
+			continue;
+		res = alloc_bootmem_low(sizeof(struct resource));
+		switch (e820.map[i].type) {
+		case E820_RAM:	res->name = "System RAM"; break;
+		case E820_ACPI:	res->name = "ACPI Tables"; break;
+		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
+		default:	res->name = "reserved";
+		}
+		res->start = e820.map[i].addr;
+		res->end = res->start + e820.map[i].size - 1;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		request_resource(&iomem_resource, res);
+		if (e820.map[i].type == E820_RAM) {
+			/*
+			 *  We don't know which RAM region contains kernel data,
+			 *  so we try it repeatedly and let the resource manager
+			 *  test it.
+			 */
+			request_resource(res, &code_resource);
+			request_resource(res, &data_resource);
+		}
+	}
+}
+
+/* 
+ * Add a memory region to the kernel e820 map.
+ */ 
+void __init add_memory_region(unsigned long start, unsigned long size, int type)
+{
+	int x = e820.nr_map;
+
+	if (x == E820MAX) {
+		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+		return;
+	}
+
+	e820.map[x].addr = start;
+	e820.map[x].size = size;
+	e820.map[x].type = type;
+	e820.nr_map++;
+}
+
+void __init e820_print_map(char *who)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		printk(" %s: %016Lx - %016Lx ", who,
+			(unsigned long long) e820.map[i].addr,
+			(unsigned long long) (e820.map[i].addr + e820.map[i].size));
+		switch (e820.map[i].type) {
+		case E820_RAM:	printk("(usable)\n");
+				break;
+		case E820_RESERVED:
+				printk("(reserved)\n");
+				break;
+		case E820_ACPI:
+				printk("(ACPI data)\n");
+				break;
+		case E820_NVS:
+				printk("(ACPI NVS)\n");
+				break;
+		default:	printk("type %u\n", e820.map[i].type);
+				break;
+		}
+	}
+}
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries.  The following 
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+{
+	struct change_member {
+		struct e820entry *pbios; /* pointer to original bios entry */
+		unsigned long long addr; /* address for this change point */
+	};
+	static struct change_member change_point_list[2*E820MAX] __initdata;
+	static struct change_member *change_point[2*E820MAX] __initdata;
+	static struct e820entry *overlap_list[E820MAX] __initdata;
+	static struct e820entry new_bios[E820MAX] __initdata;
+	struct change_member *change_tmp;
+	unsigned long current_type, last_type;
+	unsigned long long last_addr;
+	int chgidx, still_changing;
+	int overlap_entries;
+	int new_bios_entry;
+	int old_nr, new_nr;
+	int i;
+
+	/*
+		Visually we're performing the following (1,2,3,4 = memory types)...
+
+		Sample memory map (w/overlaps):
+		   ____22__________________
+		   ______________________4_
+		   ____1111________________
+		   _44_____________________
+		   11111111________________
+		   ____________________33__
+		   ___________44___________
+		   __________33333_________
+		   ______________22________
+		   ___________________2222_
+		   _________111111111______
+		   _____________________11_
+		   _________________4______
+
+		Sanitized equivalent (no overlap):
+		   1_______________________
+		   _44_____________________
+		   ___1____________________
+		   ____22__________________
+		   ______11________________
+		   _________1______________
+		   __________3_____________
+		   ___________44___________
+		   _____________33_________
+		   _______________2________
+		   ________________1_______
+		   _________________4______
+		   ___________________2____
+		   ____________________33__
+		   ______________________4_
+	*/
+
+	/* if there's only one memory region, don't bother */
+	if (*pnr_map < 2)
+		return -1;
+
+	old_nr = *pnr_map;
+
+	/* bail out if we find any unreasonable addresses in bios map */
+	for (i=0; i<old_nr; i++)
+		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+			return -1;
+
+	/* create pointers for initial change-point information (for sorting) */
+	for (i=0; i < 2*old_nr; i++)
+		change_point[i] = &change_point_list[i];
+
+	/* record all known change-points (starting and ending addresses) */
+	chgidx = 0;
+	for (i=0; i < old_nr; i++)	{
+		change_point[chgidx]->addr = biosmap[i].addr;
+		change_point[chgidx++]->pbios = &biosmap[i];
+		change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+		change_point[chgidx++]->pbios = &biosmap[i];
+	}
+
+	/* sort change-point list by memory addresses (low -> high) */
+	still_changing = 1;
+	while (still_changing)	{
+		still_changing = 0;
+		for (i=1; i < 2*old_nr; i++)  {
+			/* if <current_addr> > <last_addr>, swap */
+			/* or, if current=<start_addr> & last=<end_addr>, swap */
+			if ((change_point[i]->addr < change_point[i-1]->addr) ||
+				((change_point[i]->addr == change_point[i-1]->addr) &&
+				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
+				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+			   )
+			{
+				change_tmp = change_point[i];
+				change_point[i] = change_point[i-1];
+				change_point[i-1] = change_tmp;
+				still_changing=1;
+			}
+		}
+	}
+
+	/* create a new bios memory map, removing overlaps */
+	overlap_entries=0;	 /* number of entries in the overlap table */
+	new_bios_entry=0;	 /* index for creating new bios map entries */
+	last_type = 0;		 /* start with undefined memory type */
+	last_addr = 0;		 /* start with 0 as last starting address */
+	/* loop through change-points, determining affect on the new bios map */
+	for (chgidx=0; chgidx < 2*old_nr; chgidx++)
+	{
+		/* keep track of all overlapping bios entries */
+		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+		{
+			/* add map entry to overlap list (> 1 entry implies an overlap) */
+			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+		}
+		else
+		{
+			/* remove entry from list (order independent, so swap with last) */
+			for (i=0; i<overlap_entries; i++)
+			{
+				if (overlap_list[i] == change_point[chgidx]->pbios)
+					overlap_list[i] = overlap_list[overlap_entries-1];
+			}
+			overlap_entries--;
+		}
+		/* if there are overlapping entries, decide which "type" to use */
+		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+		current_type = 0;
+		for (i=0; i<overlap_entries; i++)
+			if (overlap_list[i]->type > current_type)
+				current_type = overlap_list[i]->type;
+		/* continue building up new bios map based on this information */
+		if (current_type != last_type)	{
+			if (last_type != 0)	 {
+				new_bios[new_bios_entry].size =
+					change_point[chgidx]->addr - last_addr;
+				/* move forward only if the new size was non-zero */
+				if (new_bios[new_bios_entry].size != 0)
+					if (++new_bios_entry >= E820MAX)
+						break; 	/* no more space left for new bios entries */
+			}
+			if (current_type != 0)	{
+				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+				new_bios[new_bios_entry].type = current_type;
+				last_addr=change_point[chgidx]->addr;
+			}
+			last_type = current_type;
+		}
+	}
+	new_nr = new_bios_entry;   /* retain count for new bios entries */
+
+	/* copy new bios mapping into original location */
+	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+	*pnr_map = new_nr;
+
+	return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory.  If we aren't, we'll fake a memory map.
+ *
+ * We check to see that the memory map contains at least 2 elements
+ * before we'll use it, because the detection code in setup.S may
+ * not be perfect and most every PC known to man has two memory
+ * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
+ * thinkpad 560x, for example, does not cooperate with the memory
+ * detection code.)
+ */
+static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+{
+	/* Only one memory region (or negative)? Ignore it */
+	if (nr_map < 2)
+		return -1;
+
+	do {
+		unsigned long start = biosmap->addr;
+		unsigned long size = biosmap->size;
+		unsigned long end = start + size;
+		unsigned long type = biosmap->type;
+
+		/* Overflow in 64 bits? Ignore the memory map. */
+		if (start > end)
+			return -1;
+
+		/*
+		 * Some BIOSes claim RAM in the 640k - 1M region.
+		 * Not right. Fix it up.
+		 * 
+		 * This should be removed on Hammer which is supposed to not
+		 * have non e820 covered ISA mappings there, but I had some strange
+		 * problems so it stays for now.  -AK
+		 */
+		if (type == E820_RAM) {
+			if (start < 0x100000ULL && end > 0xA0000ULL) {
+				if (start < 0xA0000ULL)
+					add_memory_region(start, 0xA0000ULL-start, type);
+				if (end <= 0x100000ULL)
+					continue;
+				start = 0x100000ULL;
+				size = end - start;
+			}
+		}
+
+		add_memory_region(start, size, type);
+	} while (biosmap++,--nr_map);
+	return 0;
+}
+
+void __init setup_memory_region(void)
+{
+	char *who = "BIOS-e820";
+
+	/*
+	 * Try to copy the BIOS-supplied E820-map.
+	 *
+	 * Otherwise fake a memory map; one section from 0k->640k,
+	 * the next section from 1mb->appropriate_mem_k
+	 */
+	sanitize_e820_map(E820_MAP, &E820_MAP_NR);
+	if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
+		unsigned long mem_size;
+
+		/* compare results from other methods and take the greater */
+		if (ALT_MEM_K < EXT_MEM_K) {
+			mem_size = EXT_MEM_K;
+			who = "BIOS-88";
+		} else {
+			mem_size = ALT_MEM_K;
+			who = "BIOS-e801";
+		}
+
+		e820.nr_map = 0;
+		add_memory_region(0, LOWMEMSIZE(), E820_RAM);
+		add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+  	}
+	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+	e820_print_map(who);
+}
+
+void __init parse_memopt(char *p, char **from) 
+{ 
+	end_user_pfn = memparse(p, from);
+	end_user_pfn >>= PAGE_SHIFT;	
+} 
+
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
new file mode 100644
index 000000000000..750bcd0655dc
--- /dev/null
+++ b/arch/x86_64/kernel/early_printk.c
@@ -0,0 +1,220 @@
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+/* Simple VGA output */
+
+#ifdef __i386__
+#define VGABASE		(__ISA_IO_base + 0xb8000)
+#else
+#define VGABASE		((void __iomem *)0xffffffff800b8000UL)
+#endif
+
+#define MAX_YPOS	25
+#define MAX_XPOS	80
+
+static int current_ypos = 1, current_xpos = 0; 
+
+static void early_vga_write(struct console *con, const char *str, unsigned n)
+{
+	char c;
+	int  i, k, j;
+
+	while ((c = *str++) != '\0' && n-- > 0) {
+		if (current_ypos >= MAX_YPOS) {
+			/* scroll 1 line up */
+			for (k = 1, j = 0; k < MAX_YPOS; k++, j++) {
+				for (i = 0; i < MAX_XPOS; i++) {
+					writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
+					       VGABASE + 2*(MAX_XPOS*j + i));
+				}
+			}
+			for (i = 0; i < MAX_XPOS; i++)
+				writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
+			current_ypos = MAX_YPOS-1;
+		}
+		if (c == '\n') {
+			current_xpos = 0;
+			current_ypos++;
+		} else if (c != '\r')  {
+			writew(((0x7 << 8) | (unsigned short) c),
+			       VGABASE + 2*(MAX_XPOS*current_ypos +
+						current_xpos++));
+			if (current_xpos >= MAX_XPOS) {
+				current_xpos = 0;
+				current_ypos++;
+			}
+		}
+	}
+}
+
+static struct console early_vga_console = {
+	.name =		"earlyvga",
+	.write =	early_vga_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
+
+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ 
+
+int early_serial_base = 0x3f8;  /* ttyS0 */ 
+
+#define XMTRDY          0x20
+
+#define DLAB		0x80
+
+#define TXR             0       /*  Transmit register (WRITE) */
+#define RXR             0       /*  Receive register  (READ)  */
+#define IER             1       /*  Interrupt Enable          */
+#define IIR             2       /*  Interrupt ID              */
+#define FCR             2       /*  FIFO control              */
+#define LCR             3       /*  Line control              */
+#define MCR             4       /*  Modem control             */
+#define LSR             5       /*  Line Status               */
+#define MSR             6       /*  Modem Status              */
+#define DLL             0       /*  Divisor Latch Low         */
+#define DLH             1       /*  Divisor latch High        */
+
+static int early_serial_putc(unsigned char ch) 
+{ 
+	unsigned timeout = 0xffff; 
+	while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 
+		cpu_relax();
+	outb(ch, early_serial_base + TXR);
+	return timeout ? 0 : -1;
+} 
+
+static void early_serial_write(struct console *con, const char *s, unsigned n)
+{
+	while (*s && n-- > 0) { 
+		early_serial_putc(*s); 
+		if (*s == '\n') 
+			early_serial_putc('\r'); 
+		s++; 
+	} 
+} 
+
+#define DEFAULT_BAUD 9600
+
+static __init void early_serial_init(char *s)
+{
+	unsigned char c; 
+	unsigned divisor;
+	unsigned baud = DEFAULT_BAUD;
+	char *e;
+
+	if (*s == ',')
+		++s;
+
+	if (*s) {
+		unsigned port; 
+		if (!strncmp(s,"0x",2)) {
+			early_serial_base = simple_strtoul(s, &e, 16);
+		} else {
+			static int bases[] = { 0x3f8, 0x2f8 };
+
+			if (!strncmp(s,"ttyS",4))
+				s += 4;
+			port = simple_strtoul(s, &e, 10);
+			if (port > 1 || s == e)
+				port = 0;
+			early_serial_base = bases[port];
+		}
+		s += strcspn(s, ",");
+		if (*s == ',')
+			s++;
+	}
+
+	outb(0x3, early_serial_base + LCR);	/* 8n1 */
+	outb(0, early_serial_base + IER);	/* no interrupt */
+	outb(0, early_serial_base + FCR);	/* no fifo */
+	outb(0x3, early_serial_base + MCR);	/* DTR + RTS */
+
+	if (*s) {
+		baud = simple_strtoul(s, &e, 0); 
+		if (baud == 0 || s == e) 
+			baud = DEFAULT_BAUD;
+	} 
+	
+	divisor = 115200 / baud; 
+	c = inb(early_serial_base + LCR); 
+	outb(c | DLAB, early_serial_base + LCR); 
+	outb(divisor & 0xff, early_serial_base + DLL); 
+	outb((divisor >> 8) & 0xff, early_serial_base + DLH); 
+	outb(c & ~DLAB, early_serial_base + LCR);
+}
+
+static struct console early_serial_console = {
+	.name =		"earlyser",
+	.write =	early_serial_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
+
+/* Direct interface for emergencies */
+struct console *early_console = &early_vga_console;
+static int early_console_initialized = 0;
+
+void early_printk(const char *fmt, ...)
+{ 
+	char buf[512]; 
+	int n; 
+	va_list ap;
+
+	va_start(ap,fmt); 
+	n = vscnprintf(buf,512,fmt,ap);
+	early_console->write(early_console,buf,n);
+	va_end(ap); 
+} 
+
+static int keep_early; 
+
+int __init setup_early_printk(char *opt) 
+{  
+	char *space;
+	char buf[256]; 
+
+	if (early_console_initialized)
+		return -1;
+
+	opt = strchr(opt, '=') + 1;
+
+	strlcpy(buf,opt,sizeof(buf)); 
+	space = strchr(buf, ' '); 
+	if (space)
+		*space = 0; 
+
+	if (strstr(buf,"keep"))
+		keep_early = 1; 
+
+	if (!strncmp(buf, "serial", 6)) { 
+		early_serial_init(buf + 6);
+		early_console = &early_serial_console;
+	} else if (!strncmp(buf, "ttyS", 4)) { 
+		early_serial_init(buf);
+		early_console = &early_serial_console;		
+	} else if (!strncmp(buf, "vga", 3)) {
+		early_console = &early_vga_console; 
+	}
+	early_console_initialized = 1;
+	register_console(early_console);       
+	return 0;
+}
+
+void __init disable_early_printk(void)
+{ 
+	if (!early_console_initialized || !early_console)
+		return;
+	if (!keep_early) {
+		printk("disabling early console\n");
+		unregister_console(early_console);
+		early_console_initialized = 0;
+	} else { 
+		printk("keeping early console\n");
+	}
+} 
+
+__setup("earlyprintk=", setup_early_printk);
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
new file mode 100644
index 000000000000..e126284db7a8
--- /dev/null
+++ b/arch/x86_64/kernel/entry.S
@@ -0,0 +1,920 @@
+/*
+ *  linux/arch/x86_64/entry.S
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
+ *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
+ * 
+ *  $Id$
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after an interrupt and after each system call.
+ * 
+ * Normal syscalls and interrupts don't save a full stack frame, this is 
+ * only done for syscall tracing, signals or fork/exec et.al.
+ * 
+ * A note on terminology:	 
+ * - top of stack: Architecture defined interrupt frame from SS to RIP 
+ * at the top of the kernel process stack.	
+ * - partial stack frame: partially saved registers upto R11.
+ * - full stack frame: Like partial stack frame, but all register saved. 
+ *	
+ * TODO:	 
+ * - schedule it carefully for the final hardware.
+ */
+
+#define ASSEMBLY 1
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/offset.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+
+	.code64
+
+#ifdef CONFIG_PREEMPT
+#define preempt_stop cli
+#else
+#define preempt_stop
+#define retint_kernel retint_restore_args
+#endif	
+	
+/*
+ * C code is not supposed to know about undefined top of stack. Every time 
+ * a C function with an pt_regs argument is called from the SYSCALL based 
+ * fast path FIXUP_TOP_OF_STACK is needed.
+ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
+ * manipulation.
+ */        	
+		
+	/* %rsp:at FRAMEEND */ 
+	.macro FIXUP_TOP_OF_STACK tmp
+	movq	%gs:pda_oldrsp,\tmp
+	movq  	\tmp,RSP(%rsp)
+	movq    $__USER_DS,SS(%rsp)
+	movq    $__USER_CS,CS(%rsp)
+	movq 	$-1,RCX(%rsp)
+	movq	R11(%rsp),\tmp  /* get eflags */
+	movq	\tmp,EFLAGS(%rsp)
+	.endm
+
+	.macro RESTORE_TOP_OF_STACK tmp,offset=0
+	movq   RSP-\offset(%rsp),\tmp
+	movq   \tmp,%gs:pda_oldrsp
+	movq   EFLAGS-\offset(%rsp),\tmp
+	movq   \tmp,R11-\offset(%rsp)
+	.endm
+
+	.macro FAKE_STACK_FRAME child_rip
+	/* push in order ss, rsp, eflags, cs, rip */
+	xorq %rax, %rax
+	pushq %rax /* ss */
+	CFI_ADJUST_CFA_OFFSET	8
+	pushq %rax /* rsp */
+	CFI_ADJUST_CFA_OFFSET	8
+	CFI_OFFSET	rip,0
+	pushq $(1<<9) /* eflags - interrupts on */
+	CFI_ADJUST_CFA_OFFSET	8
+	pushq $__KERNEL_CS /* cs */
+	CFI_ADJUST_CFA_OFFSET	8
+	pushq \child_rip /* rip */
+	CFI_ADJUST_CFA_OFFSET	8
+	CFI_OFFSET	rip,0
+	pushq	%rax /* orig rax */
+	CFI_ADJUST_CFA_OFFSET	8
+	.endm
+
+	.macro UNFAKE_STACK_FRAME
+	addq $8*6, %rsp
+	CFI_ADJUST_CFA_OFFSET	-(6*8)
+	.endm
+
+	.macro	CFI_DEFAULT_STACK
+	CFI_ADJUST_CFA_OFFSET  (SS)
+	CFI_OFFSET	r15,R15-SS
+	CFI_OFFSET	r14,R14-SS
+	CFI_OFFSET	r13,R13-SS
+	CFI_OFFSET	r12,R12-SS
+	CFI_OFFSET	rbp,RBP-SS
+	CFI_OFFSET	rbx,RBX-SS
+	CFI_OFFSET	r11,R11-SS
+	CFI_OFFSET	r10,R10-SS
+	CFI_OFFSET	r9,R9-SS
+	CFI_OFFSET	r8,R8-SS
+	CFI_OFFSET	rax,RAX-SS
+	CFI_OFFSET	rcx,RCX-SS
+	CFI_OFFSET	rdx,RDX-SS
+	CFI_OFFSET	rsi,RSI-SS
+	CFI_OFFSET	rdi,RDI-SS
+	CFI_OFFSET	rsp,RSP-SS
+	CFI_OFFSET	rip,RIP-SS
+	.endm
+/*
+ * A newly forked process directly context switches into this.
+ */ 	
+/* rdi:	prev */	
+ENTRY(ret_from_fork)
+	CFI_STARTPROC
+	CFI_DEFAULT_STACK
+	call schedule_tail
+	GET_THREAD_INFO(%rcx)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	jnz rff_trace
+rff_action:	
+	RESTORE_REST
+	testl $3,CS-ARGOFFSET(%rsp)	# from kernel_thread?
+	je   int_ret_from_sys_call
+	testl $_TIF_IA32,threadinfo_flags(%rcx)
+	jnz  int_ret_from_sys_call
+	RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
+	jmp ret_from_sys_call
+rff_trace:
+	movq %rsp,%rdi
+	call syscall_trace_leave
+	GET_THREAD_INFO(%rcx)	
+	jmp rff_action
+	CFI_ENDPROC
+
+/*
+ * System call entry. Upto 6 arguments in registers are supported.
+ *
+ * SYSCALL does not save anything on the stack and does not change the
+ * stack pointer.
+ */
+		
+/*
+ * Register setup:	
+ * rax  system call number
+ * rdi  arg0
+ * rcx  return address for syscall/sysret, C arg3 
+ * rsi  arg1
+ * rdx  arg2	
+ * r10  arg3 	(--> moved to rcx for C)
+ * r8   arg4
+ * r9   arg5
+ * r11  eflags for syscall/sysret, temporary for C
+ * r12-r15,rbp,rbx saved by C code, not touched. 		
+ * 
+ * Interrupts are off on entry.
+ * Only called from user space.
+ *
+ * XXX	if we had a free scratch register we could save the RSP into the stack frame
+ *      and report it properly in ps. Unfortunately we haven't.
+ */ 			 		
+
+ENTRY(system_call)
+	CFI_STARTPROC
+	swapgs
+	movq	%rsp,%gs:pda_oldrsp 
+	movq	%gs:pda_kernelstack,%rsp
+	sti					
+	SAVE_ARGS 8,1
+	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
+	movq  %rcx,RIP-ARGOFFSET(%rsp)  
+	GET_THREAD_INFO(%rcx)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+	jnz tracesys
+	cmpq $__NR_syscall_max,%rax
+	ja badsys
+	movq %r10,%rcx
+	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
+	movq %rax,RAX-ARGOFFSET(%rsp)
+/*
+ * Syscall return path ending with SYSRET (fast path)
+ * Has incomplete stack frame and undefined top of stack. 
+ */		
+	.globl ret_from_sys_call
+ret_from_sys_call:
+	movl $_TIF_WORK_MASK,%edi
+	/* edi:	flagmask */
+sysret_check:		
+	GET_THREAD_INFO(%rcx)
+	cli
+	movl threadinfo_flags(%rcx),%edx
+	andl %edi,%edx
+	jnz  sysret_careful 
+	movq RIP-ARGOFFSET(%rsp),%rcx
+	RESTORE_ARGS 0,-ARG_SKIP,1
+	movq	%gs:pda_oldrsp,%rsp
+	swapgs
+	sysretq
+
+	/* Handle reschedules */
+	/* edx:	work, edi: workmask */	
+sysret_careful:
+	bt $TIF_NEED_RESCHED,%edx
+	jnc sysret_signal
+	sti
+	pushq %rdi
+	call schedule
+	popq  %rdi
+	jmp sysret_check
+
+	/* Handle a signal */ 
+sysret_signal:
+	sti
+	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+	jz    1f
+
+	/* Really a signal */
+	/* edx:	work flags (arg3) */
+	leaq do_notify_resume(%rip),%rax
+	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
+	xorl %esi,%esi # oldset -> arg2
+	call ptregscall_common
+1:	movl $_TIF_NEED_RESCHED,%edi
+	jmp sysret_check
+	
+	/* Do syscall tracing */
+tracesys:			 
+	SAVE_REST
+	movq $-ENOSYS,RAX(%rsp)
+	FIXUP_TOP_OF_STACK %rdi
+	movq %rsp,%rdi
+	call syscall_trace_enter
+	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	cmpq $__NR_syscall_max,%rax
+	ja  1f
+	movq %r10,%rcx	/* fixup for C */
+	call *sys_call_table(,%rax,8)
+	movq %rax,RAX-ARGOFFSET(%rsp)
+1:	SAVE_REST
+	movq %rsp,%rdi
+	call syscall_trace_leave
+	RESTORE_TOP_OF_STACK %rbx
+	RESTORE_REST
+	jmp ret_from_sys_call
+		
+badsys:
+	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)	
+	jmp ret_from_sys_call
+
+/* 
+ * Syscall return path ending with IRET.
+ * Has correct top of stack, but partial stack frame.
+ */ 	
+ENTRY(int_ret_from_sys_call)	
+	cli
+	testl $3,CS-ARGOFFSET(%rsp)
+	je retint_restore_args
+	movl $_TIF_ALLWORK_MASK,%edi
+	/* edi:	mask to check */
+int_with_check:
+	GET_THREAD_INFO(%rcx)
+	movl threadinfo_flags(%rcx),%edx
+	andl %edi,%edx
+	jnz   int_careful
+	jmp   retint_swapgs
+
+	/* Either reschedule or signal or syscall exit tracking needed. */
+	/* First do a reschedule test. */
+	/* edx:	work, edi: workmask */
+int_careful:
+	bt $TIF_NEED_RESCHED,%edx
+	jnc  int_very_careful
+	sti
+	pushq %rdi
+	call schedule
+	popq %rdi
+	jmp int_with_check
+
+	/* handle signals and tracing -- both require a full stack frame */
+int_very_careful:
+	sti
+	SAVE_REST
+	/* Check for syscall exit trace */	
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+	jz int_signal
+	pushq %rdi
+	leaq 8(%rsp),%rdi	# &ptregs -> arg1	
+	call syscall_trace_leave
+	popq %rdi
+	btr  $TIF_SYSCALL_TRACE,%edi
+	btr  $TIF_SYSCALL_AUDIT,%edi
+	btr  $TIF_SINGLESTEP,%edi
+	jmp int_restore_rest
+	
+int_signal:
+	testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
+	jz 1f
+	movq %rsp,%rdi		# &ptregs -> arg1
+	xorl %esi,%esi		# oldset -> arg2
+	call do_notify_resume
+1:	movl $_TIF_NEED_RESCHED,%edi	
+int_restore_rest:
+	RESTORE_REST
+	jmp int_with_check
+	CFI_ENDPROC
+		
+/* 
+ * Certain special system calls that need to save a complete full stack frame.
+ */ 								
+	
+	.macro PTREGSCALL label,func,arg
+	.globl \label
+\label:
+	leaq	\func(%rip),%rax
+	leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
+	jmp	ptregscall_common
+	.endm
+
+	PTREGSCALL stub_clone, sys_clone, %r8
+	PTREGSCALL stub_fork, sys_fork, %rdi
+	PTREGSCALL stub_vfork, sys_vfork, %rdi
+	PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
+	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
+	PTREGSCALL stub_iopl, sys_iopl, %rsi
+
+ENTRY(ptregscall_common)
+	CFI_STARTPROC
+	popq %r11
+	CFI_ADJUST_CFA_OFFSET	-8
+	SAVE_REST
+	movq %r11, %r15
+	FIXUP_TOP_OF_STACK %r11
+	call *%rax
+	RESTORE_TOP_OF_STACK %r11
+	movq %r15, %r11
+	RESTORE_REST
+	pushq %r11
+	CFI_ADJUST_CFA_OFFSET	8
+	ret
+	CFI_ENDPROC
+	
+ENTRY(stub_execve)
+	CFI_STARTPROC
+	popq %r11
+	CFI_ADJUST_CFA_OFFSET	-8
+	SAVE_REST
+	movq %r11, %r15
+	FIXUP_TOP_OF_STACK %r11
+	call sys_execve
+	GET_THREAD_INFO(%rcx)
+	bt $TIF_IA32,threadinfo_flags(%rcx)
+	jc exec_32bit
+	RESTORE_TOP_OF_STACK %r11
+	movq %r15, %r11
+	RESTORE_REST
+	push %r11
+	ret
+
+exec_32bit:
+	CFI_ADJUST_CFA_OFFSET	REST_SKIP
+	movq %rax,RAX(%rsp)
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+	
+/*
+ * sigreturn is special because it needs to restore all registers on return.
+ * This cannot be done with SYSRET, so use the IRET return path instead.
+ */                
+ENTRY(stub_rt_sigreturn)
+	CFI_STARTPROC
+	addq $8, %rsp		
+	SAVE_REST
+	movq %rsp,%rdi
+	FIXUP_TOP_OF_STACK %r11
+	call sys_rt_sigreturn
+	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+
+/* 
+ * Interrupt entry/exit.
+ *
+ * Interrupt entry points save only callee clobbered registers in fast path.
+ *	
+ * Entry runs with interrupts off.	
+ */ 
+
+/* 0(%rsp): interrupt number */ 
+	.macro interrupt func
+	CFI_STARTPROC	simple
+	CFI_DEF_CFA	rsp,(SS-RDI)
+	CFI_REL_OFFSET	rsp,(RSP-ORIG_RAX)
+	CFI_REL_OFFSET	rip,(RIP-ORIG_RAX)
+	cld
+#ifdef CONFIG_DEBUG_INFO
+	SAVE_ALL	
+	movq %rsp,%rdi
+	/*
+	 * Setup a stack frame pointer.  This allows gdb to trace
+	 * back to the original stack.
+	 */
+	movq %rsp,%rbp
+	CFI_DEF_CFA_REGISTER	rbp
+#else		
+	SAVE_ARGS
+	leaq -ARGOFFSET(%rsp),%rdi	# arg1 for handler
+#endif	
+	testl $3,CS(%rdi)
+	je 1f
+	swapgs	
+1:	addl $1,%gs:pda_irqcount	# RED-PEN should check preempt count
+	movq %gs:pda_irqstackptr,%rax
+	cmoveq %rax,%rsp							
+	pushq %rdi			# save old stack	
+	call \func
+	.endm
+
+ENTRY(common_interrupt)
+	interrupt do_IRQ
+	/* 0(%rsp): oldrsp-ARGOFFSET */
+ret_from_intr:		
+	popq  %rdi
+	cli	
+	subl $1,%gs:pda_irqcount
+#ifdef CONFIG_DEBUG_INFO
+	movq RBP(%rdi),%rbp
+#endif
+	leaq ARGOFFSET(%rdi),%rsp
+exit_intr:	 	
+	GET_THREAD_INFO(%rcx)
+	testl $3,CS-ARGOFFSET(%rsp)
+	je retint_kernel
+	
+	/* Interrupt came from user space */
+	/*
+	 * Has a correct top of stack, but a partial stack frame
+	 * %rcx: thread info. Interrupts off.
+	 */		
+retint_with_reschedule:
+	movl $_TIF_WORK_MASK,%edi
+retint_check:			
+	movl threadinfo_flags(%rcx),%edx
+	andl %edi,%edx
+	jnz  retint_careful
+retint_swapgs:	 	
+	cli
+	swapgs 
+retint_restore_args:				
+	cli
+	RESTORE_ARGS 0,8,0						
+iret_label:	
+	iretq
+
+	.section __ex_table,"a"
+	.quad iret_label,bad_iret	
+	.previous
+	.section .fixup,"ax"
+	/* force a signal here? this matches i386 behaviour */
+	/* running with kernel gs */
+bad_iret:
+	movq $-9999,%rdi	/* better code? */
+	jmp do_exit			
+	.previous	
+	
+	/* edi: workmask, edx: work */	
+retint_careful:
+	bt    $TIF_NEED_RESCHED,%edx
+	jnc   retint_signal
+	sti
+	pushq %rdi
+	call  schedule
+	popq %rdi		
+	GET_THREAD_INFO(%rcx)
+	cli
+	jmp retint_check
+	
+retint_signal:
+	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+	jz    retint_swapgs
+	sti
+	SAVE_REST
+	movq $-1,ORIG_RAX(%rsp) 			
+	xorq %rsi,%rsi		# oldset
+	movq %rsp,%rdi		# &pt_regs
+	call do_notify_resume
+	RESTORE_REST
+	cli
+	movl $_TIF_NEED_RESCHED,%edi
+	GET_THREAD_INFO(%rcx)	
+	jmp retint_check
+
+#ifdef CONFIG_PREEMPT
+	/* Returning to kernel space. Check if we need preemption */
+	/* rcx:	 threadinfo. interrupts off. */
+	.p2align
+retint_kernel:	
+	cmpl $0,threadinfo_preempt_count(%rcx)
+	jnz  retint_restore_args
+	bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
+	jnc  retint_restore_args
+	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
+	jnc  retint_restore_args
+	call preempt_schedule_irq
+	jmp exit_intr
+#endif	
+	CFI_ENDPROC
+	
+/*
+ * APIC interrupts.
+ */		
+	.macro apicinterrupt num,func
+	pushq $\num-256
+	interrupt \func
+	jmp ret_from_intr
+	CFI_ENDPROC
+	.endm
+
+ENTRY(thermal_interrupt)
+	apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
+
+#ifdef CONFIG_SMP	
+ENTRY(reschedule_interrupt)
+	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
+
+ENTRY(invalidate_interrupt)
+	apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt
+
+ENTRY(call_function_interrupt)
+	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC	
+ENTRY(apic_timer_interrupt)
+	apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
+
+ENTRY(error_interrupt)
+	apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
+
+ENTRY(spurious_interrupt)
+	apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
+#endif
+				
+/*
+ * Exception entry points.
+ */ 		
+	.macro zeroentry sym
+	pushq $0	/* push error code/oldrax */ 
+	pushq %rax	/* push real oldrax to the rdi slot */ 
+	leaq  \sym(%rip),%rax
+	jmp error_entry
+	.endm	
+
+	.macro errorentry sym
+	pushq %rax
+	leaq  \sym(%rip),%rax
+	jmp error_entry
+	.endm
+
+	/* error code is on the stack already */
+	/* handle NMI like exceptions that can happen everywhere */
+	.macro paranoidentry sym
+	SAVE_ALL
+	cld
+	movl $1,%ebx
+	movl  $MSR_GS_BASE,%ecx
+	rdmsr
+	testl %edx,%edx
+	js    1f
+	swapgs
+	xorl  %ebx,%ebx
+1:	movq %rsp,%rdi
+	movq ORIG_RAX(%rsp),%rsi
+	movq $-1,ORIG_RAX(%rsp)
+	call \sym
+	.endm
+	
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack
+ * and the exception handler in %rax.	
+ */ 		  				
+ENTRY(error_entry)
+	CFI_STARTPROC	simple
+	CFI_DEF_CFA	rsp,(SS-RDI)
+	CFI_REL_OFFSET	rsp,(RSP-RDI)
+	CFI_REL_OFFSET	rip,(RIP-RDI)
+	/* rdi slot contains rax, oldrax contains error code */
+	cld	
+	subq  $14*8,%rsp
+	CFI_ADJUST_CFA_OFFSET	(14*8)
+	movq %rsi,13*8(%rsp)
+	CFI_REL_OFFSET	rsi,RSI
+	movq 14*8(%rsp),%rsi	/* load rax from rdi slot */
+	movq %rdx,12*8(%rsp)
+	CFI_REL_OFFSET	rdx,RDX
+	movq %rcx,11*8(%rsp)
+	CFI_REL_OFFSET	rcx,RCX
+	movq %rsi,10*8(%rsp)	/* store rax */ 
+	CFI_REL_OFFSET	rax,RAX
+	movq %r8, 9*8(%rsp)
+	CFI_REL_OFFSET	r8,R8
+	movq %r9, 8*8(%rsp)
+	CFI_REL_OFFSET	r9,R9
+	movq %r10,7*8(%rsp)
+	CFI_REL_OFFSET	r10,R10
+	movq %r11,6*8(%rsp)
+	CFI_REL_OFFSET	r11,R11
+	movq %rbx,5*8(%rsp) 
+	CFI_REL_OFFSET	rbx,RBX
+	movq %rbp,4*8(%rsp) 
+	CFI_REL_OFFSET	rbp,RBP
+	movq %r12,3*8(%rsp) 
+	CFI_REL_OFFSET	r12,R12
+	movq %r13,2*8(%rsp) 
+	CFI_REL_OFFSET	r13,R13
+	movq %r14,1*8(%rsp) 
+	CFI_REL_OFFSET	r14,R14
+	movq %r15,(%rsp) 
+	CFI_REL_OFFSET	r15,R15
+	xorl %ebx,%ebx	
+	testl $3,CS(%rsp)
+	je  error_kernelspace
+error_swapgs:	
+	swapgs
+error_sti:	
+	movq %rdi,RDI(%rsp) 	
+	movq %rsp,%rdi
+	movq ORIG_RAX(%rsp),%rsi	/* get error code */ 
+	movq $-1,ORIG_RAX(%rsp)
+	call *%rax
+	/* ebx:	no swapgs flag (1: don't need swapgs, 0: need it) */	 
+error_exit:		
+	movl %ebx,%eax		
+	RESTORE_REST
+	cli
+	GET_THREAD_INFO(%rcx)	
+	testl %eax,%eax
+	jne  retint_kernel
+	movl  threadinfo_flags(%rcx),%edx
+	movl  $_TIF_WORK_MASK,%edi
+	andl  %edi,%edx
+	jnz  retint_careful
+	swapgs 
+	RESTORE_ARGS 0,8,0						
+	iretq
+	CFI_ENDPROC
+
+error_kernelspace:
+	incl %ebx
+       /* There are two places in the kernel that can potentially fault with
+          usergs. Handle them here. The exception handlers after
+	   iret run with kernel gs again, so don't set the user space flag.
+	   B stepping K8s sometimes report an truncated RIP for IRET 
+	   exceptions returning to compat mode. Check for these here too. */
+	leaq iret_label(%rip),%rbp
+	cmpq %rbp,RIP(%rsp) 
+	je   error_swapgs
+	movl %ebp,%ebp	/* zero extend */
+	cmpq %rbp,RIP(%rsp) 
+	je   error_swapgs
+	cmpq $gs_change,RIP(%rsp)
+        je   error_swapgs
+	jmp  error_sti
+	
+       /* Reload gs selector with exception handling */
+       /* edi:  new selector */ 
+ENTRY(load_gs_index)
+	pushf
+	cli
+        swapgs
+gs_change:     
+        movl %edi,%gs   
+2:	mfence		/* workaround */
+	swapgs
+        popf
+        ret
+       
+        .section __ex_table,"a"
+        .align 8
+        .quad gs_change,bad_gs
+        .previous
+        .section .fixup,"ax"
+	/* running with kernelgs */
+bad_gs: 
+	swapgs			/* switch back to user gs */
+	xorl %eax,%eax
+        movl %eax,%gs
+        jmp  2b
+        .previous       
+	
+/*
+ * Create a kernel thread.
+ *
+ * C extern interface:
+ *	extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+ *
+ * asm input arguments:
+ *	rdi: fn, rsi: arg, rdx: flags
+ */
+ENTRY(kernel_thread)
+	CFI_STARTPROC
+	FAKE_STACK_FRAME $child_rip
+	SAVE_ALL
+
+	# rdi: flags, rsi: usp, rdx: will be &pt_regs
+	movq %rdx,%rdi
+	orq  kernel_thread_flags(%rip),%rdi
+	movq $-1, %rsi
+	movq %rsp, %rdx
+
+	xorl %r8d,%r8d
+	xorl %r9d,%r9d
+	
+	# clone now
+	call do_fork
+	movq %rax,RAX(%rsp)
+	xorl %edi,%edi
+
+	/*
+	 * It isn't worth to check for reschedule here,
+	 * so internally to the x86_64 port you can rely on kernel_thread()
+	 * not to reschedule the child before returning, this avoids the need
+	 * of hacks for example to fork off the per-CPU idle tasks.
+         * [Hopefully no generic code relies on the reschedule -AK]	
+	 */
+	RESTORE_ALL
+	UNFAKE_STACK_FRAME
+	ret
+	CFI_ENDPROC
+
+	
+child_rip:
+	/*
+	 * Here we are in the child and the registers are set as they were
+	 * at kernel_thread() invocation in the parent.
+	 */
+	movq %rdi, %rax
+	movq %rsi, %rdi
+	call *%rax
+	# exit
+	xorq %rdi, %rdi
+	call do_exit
+
+/*
+ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
+ *
+ * C extern interface:
+ *	 extern long execve(char *name, char **argv, char **envp)
+ *
+ * asm input arguments:
+ *	rdi: name, rsi: argv, rdx: envp
+ *
+ * We want to fallback into:
+ *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
+ *
+ * do_sys_execve asm fallback arguments:
+ *	rdi: name, rsi: argv, rdx: envp, fake frame on the stack
+ */
+ENTRY(execve)
+	CFI_STARTPROC
+	FAKE_STACK_FRAME $0
+	SAVE_ALL	
+	call sys_execve
+	movq %rax, RAX(%rsp)	
+	RESTORE_REST
+	testq %rax,%rax
+	je int_ret_from_sys_call
+	RESTORE_ARGS
+	UNFAKE_STACK_FRAME
+	ret
+	CFI_ENDPROC
+
+ENTRY(page_fault)
+	errorentry do_page_fault
+
+ENTRY(coprocessor_error)
+	zeroentry do_coprocessor_error
+
+ENTRY(simd_coprocessor_error)
+	zeroentry do_simd_coprocessor_error	
+
+ENTRY(device_not_available)
+	zeroentry math_state_restore
+
+	/* runs on exception stack */
+ENTRY(debug)
+	CFI_STARTPROC
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8		
+	paranoidentry do_debug
+	/* switch back to process stack to restore the state ptrace touched */
+	movq %rax,%rsp	
+	testl $3,CS(%rsp)
+	jnz   paranoid_userspace	
+	jmp paranoid_exit
+	CFI_ENDPROC
+
+	/* runs on exception stack */	
+ENTRY(nmi)
+	CFI_STARTPROC
+	pushq $-1
+	CFI_ADJUST_CFA_OFFSET 8		
+	paranoidentry do_nmi
+	/* ebx:	no swapgs flag */
+paranoid_exit:
+	testl %ebx,%ebx				/* swapgs needed? */
+	jnz paranoid_restore
+paranoid_swapgs:	
+	cli
+	swapgs
+paranoid_restore:	
+	RESTORE_ALL 8
+	iretq
+paranoid_userspace:	
+	cli
+	GET_THREAD_INFO(%rcx)
+	movl threadinfo_flags(%rcx),%edx
+	testl $_TIF_NEED_RESCHED,%edx
+	jnz paranoid_resched
+	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+	jnz paranoid_signal
+	jmp paranoid_swapgs
+paranoid_resched:		
+	sti
+	call schedule
+	jmp paranoid_exit
+paranoid_signal:		
+	sti
+	xorl %esi,%esi /* oldset */
+	movq %rsp,%rdi /* &pt_regs */
+	call do_notify_resume
+	jmp paranoid_exit
+	CFI_ENDPROC
+	
+ENTRY(int3)
+	zeroentry do_int3	
+
+ENTRY(overflow)
+	zeroentry do_overflow
+
+ENTRY(bounds)
+	zeroentry do_bounds
+
+ENTRY(invalid_op)
+	zeroentry do_invalid_op	
+
+ENTRY(coprocessor_segment_overrun)
+	zeroentry do_coprocessor_segment_overrun
+
+ENTRY(reserved)
+	zeroentry do_reserved
+
+	/* runs on exception stack */
+ENTRY(double_fault)
+	CFI_STARTPROC
+	paranoidentry do_double_fault
+	movq %rax,%rsp
+	testl $3,CS(%rsp)
+	jnz paranoid_userspace		
+	jmp paranoid_exit
+	CFI_ENDPROC
+
+ENTRY(invalid_TSS)
+	errorentry do_invalid_TSS
+
+ENTRY(segment_not_present)
+	errorentry do_segment_not_present
+
+	/* runs on exception stack */
+ENTRY(stack_segment)
+	CFI_STARTPROC
+	paranoidentry do_stack_segment
+	movq %rax,%rsp
+	testl $3,CS(%rsp)
+	jnz paranoid_userspace
+	jmp paranoid_exit
+	CFI_ENDPROC
+
+ENTRY(general_protection)
+	errorentry do_general_protection
+
+ENTRY(alignment_check)
+	errorentry do_alignment_check
+
+ENTRY(divide_error)
+	zeroentry do_divide_error
+
+ENTRY(spurious_interrupt_bug)
+	zeroentry do_spurious_interrupt_bug
+
+#ifdef CONFIG_X86_MCE
+	/* runs on exception stack */
+ENTRY(machine_check)
+	CFI_STARTPROC
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8	
+	paranoidentry do_machine_check
+	jmp paranoid_exit
+	CFI_ENDPROC
+#endif
+
+ENTRY(call_debug)
+       zeroentry do_call_debug
+
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
new file mode 100644
index 000000000000..d2c42fb99cb3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch probe layer.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/config.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+
+/* which logical CPU number maps to which CPU (physical APIC ID) */
+u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+EXPORT_SYMBOL(x86_cpu_to_apicid);
+u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+extern struct genapic apic_cluster;
+extern struct genapic apic_flat;
+
+struct genapic *genapic = &apic_flat;
+
+
+/*
+ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
+ */
+void __init clustered_apic_check(void)
+{
+	long i;
+	u8 clusters, max_cluster;
+	u8 id;
+	u8 cluster_cnt[NUM_APIC_CLUSTERS];
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		/* AMD always uses flat mode right now */
+		genapic = &apic_flat;
+		goto print;
+	}
+
+	memset(cluster_cnt, 0, sizeof(cluster_cnt));
+
+	for (i = 0; i < NR_CPUS; i++) {
+		id = bios_cpu_apicid[i];
+		if (id != BAD_APICID)
+			cluster_cnt[APIC_CLUSTERID(id)]++;
+	}
+
+	clusters = 0;
+	max_cluster = 0;
+	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+		if (cluster_cnt[i] > 0) {
+			++clusters;
+			if (cluster_cnt[i] > max_cluster)
+				max_cluster = cluster_cnt[i];
+		}
+	}
+
+	/*
+	 * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
+	 * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
+	 * else physical mode.
+	 * (We don't use lowest priority delivery + HW APIC IRQ steering, so
+	 * can ignore the clustered logical case and go straight to physical.)
+	 */
+	if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
+		genapic = &apic_flat;
+	else
+		genapic = &apic_cluster;
+
+print:
+	printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+}
+
+/* Same for both flat and clustered. */
+
+void send_IPI_self(int vector)
+{
+	__send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
new file mode 100644
index 000000000000..9703da7202e3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Clustered APIC subarch code.  Up to 255 CPUs, physical delivery.
+ * (A more realistic maximum is around 230 CPUs.)
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/config.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <asm/smp.h>
+#include <asm/ipi.h>
+
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116).  So here it goes...
+ */
+static void cluster_init_apic_ldr(void)
+{
+	unsigned long val, id;
+	long i, count;
+	u8 lid;
+	u8 my_id = hard_smp_processor_id();
+	u8 my_cluster = APIC_CLUSTER(my_id);
+
+	/* Create logical APIC IDs by counting CPUs already in cluster. */
+	for (count = 0, i = NR_CPUS; --i >= 0; ) {
+		lid = x86_cpu_to_log_apicid[i];
+		if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
+			++count;
+	}
+	/*
+	 * We only have a 4 wide bitmap in cluster mode.  There's no way
+	 * to get above 60 CPUs and still give each one it's own bit.
+	 * But, we're using physical IRQ delivery, so we don't care.
+	 * Use bit 3 for the 4th through Nth CPU in each cluster.
+	 */
+	if (count >= XAPIC_DEST_CPUS_SHIFT)
+		count = 3;
+	id = my_cluster | (1UL << count);
+	x86_cpu_to_log_apicid[smp_processor_id()] = id;
+	apic_write_around(APIC_DFR, APIC_DFR_CLUSTER);
+	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+	val |= SET_APIC_LOGICAL_ID(id);
+	apic_write_around(APIC_LDR, val);
+}
+
+/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
+
+static cpumask_t cluster_target_cpus(void)
+{
+	return cpumask_of_cpu(0);
+}
+
+static void cluster_send_IPI_mask(cpumask_t mask, int vector)
+{
+	send_IPI_mask_sequence(mask, vector);
+}
+
+static void cluster_send_IPI_allbutself(int vector)
+{
+	cpumask_t mask = cpu_online_map;
+	cpu_clear(smp_processor_id(), mask);
+
+	if (!cpus_empty(mask))
+		cluster_send_IPI_mask(mask, vector);
+}
+
+static void cluster_send_IPI_all(int vector)
+{
+	cluster_send_IPI_mask(cpu_online_map, vector);
+}
+
+static int cluster_apic_id_registered(void)
+{
+	return 1;
+}
+
+static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+	int cpu;
+
+	/*
+	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
+	 * May as well be the first.
+	 */
+	cpu = first_cpu(cpumask);
+	if ((unsigned)cpu < NR_CPUS)
+		return x86_cpu_to_apicid[cpu];
+	else
+		return BAD_APICID;
+}
+
+/* cpuid returns the value latched in the HW at reset, not the APIC ID
+ * register's value.  For any box whose BIOS changes APIC IDs, like
+ * clustered APIC systems, we must use hard_smp_processor_id.
+ *
+ * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
+ */
+static unsigned int phys_pkg_id(int index_msb)
+{
+	return hard_smp_processor_id() >> index_msb;
+}
+
+struct genapic apic_cluster = {
+	.name = "clustered",
+	.int_delivery_mode = dest_Fixed,
+	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+	.int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
+	.target_cpus = cluster_target_cpus,
+	.apic_id_registered = cluster_apic_id_registered,
+	.init_apic_ldr = cluster_init_apic_ldr,
+	.send_IPI_all = cluster_send_IPI_all,
+	.send_IPI_allbutself = cluster_send_IPI_allbutself,
+	.send_IPI_mask = cluster_send_IPI_mask,
+	.cpu_mask_to_apicid = cluster_cpu_mask_to_apicid,
+	.phys_pkg_id = phys_pkg_id,
+};
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
new file mode 100644
index 000000000000..b4cbbad04226
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Flat APIC subarch code.  Maximum 8 CPUs, logical delivery.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/config.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <asm/smp.h>
+#include <asm/ipi.h>
+
+
+static cpumask_t flat_target_cpus(void)
+{
+	return cpu_online_map;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116).  So here it goes...
+ */
+static void flat_init_apic_ldr(void)
+{
+	unsigned long val;
+	unsigned long num, id;
+
+	num = smp_processor_id();
+	id = 1UL << num;
+	x86_cpu_to_log_apicid[num] = id;
+	apic_write_around(APIC_DFR, APIC_DFR_FLAT);
+	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+	val |= SET_APIC_LOGICAL_ID(id);
+	apic_write_around(APIC_LDR, val);
+}
+
+static void flat_send_IPI_allbutself(int vector)
+{
+	/*
+	 * if there are no other CPUs in the system then
+	 * we get an APIC send error if we try to broadcast.
+	 * thus we have to avoid sending IPIs in this case.
+	 */
+	if (num_online_cpus() > 1)
+		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
+}
+
+static void flat_send_IPI_all(int vector)
+{
+	__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+}
+
+static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
+{
+	unsigned long mask = cpus_addr(cpumask)[0];
+	unsigned long cfg;
+	unsigned long flags;
+
+	local_save_flags(flags);
+	local_irq_disable();
+
+	/*
+	 * Wait for idle.
+	 */
+	apic_wait_icr_idle();
+
+	/*
+	 * prepare target chip field
+	 */
+	cfg = __prepare_ICR2(mask);
+	apic_write_around(APIC_ICR2, cfg);
+
+	/*
+	 * program the ICR
+	 */
+	cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL);
+
+	/*
+	 * Send the IPI. The write to APIC_ICR fires this off.
+	 */
+	apic_write_around(APIC_ICR, cfg);
+	local_irq_restore(flags);
+}
+
+static int flat_apic_id_registered(void)
+{
+	return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
+}
+
+static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+	return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+	u32 ebx;
+
+	ebx = cpuid_ebx(1);
+	return ((ebx >> 24) & 0xFF) >> index_msb;
+}
+
+struct genapic apic_flat =  {
+	.name = "flat",
+	.int_delivery_mode = dest_LowestPrio,
+	.int_dest_mode = (APIC_DEST_LOGICAL != 0),
+	.int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
+	.target_cpus = flat_target_cpus,
+	.apic_id_registered = flat_apic_id_registered,
+	.init_apic_ldr = flat_init_apic_ldr,
+	.send_IPI_all = flat_send_IPI_all,
+	.send_IPI_allbutself = flat_send_IPI_allbutself,
+	.send_IPI_mask = flat_send_IPI_mask,
+	.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+	.phys_pkg_id = phys_pkg_id,
+};
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
new file mode 100644
index 000000000000..b6d8725c1f61
--- /dev/null
+++ b/arch/x86_64/kernel/head.S
@@ -0,0 +1,396 @@
+/*
+ *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ *
+ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
+ *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ *
+ *  $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
+ */
+
+
+#include <linux/linkage.h>
+#include <linux/threads.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/cache.h>
+	
+/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
+ * because we need identity-mapped pages on setup so define __START_KERNEL to
+ * 0x100000 for this stage
+ * 
+ */
+
+	.text
+	.code32
+	.globl startup_32
+/* %bx:	 1 if coming from smp trampoline on secondary cpu */ 
+startup_32:
+	
+	/*
+	 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
+	 * paging disabled and the point of this file is to switch to 64bit
+	 * long mode with a kernel mapping for kerneland to jump into the
+	 * kernel virtual addresses.
+ 	 * There is no stack until we set one up.
+	 */
+
+	/* Initialize the %ds segment register */
+	movl $__KERNEL_DS,%eax
+	movl %eax,%ds
+
+	/* Load new GDT with the 64bit segments using 32bit descriptor */
+	lgdt	pGDT32 - __START_KERNEL_map
+
+	/* If the CPU doesn't support CPUID this will double fault.
+	 * Unfortunately it is hard to check for CPUID without a stack. 
+	 */
+	
+	/* Check if extended functions are implemented */		
+	movl	$0x80000000, %eax
+	cpuid
+	cmpl	$0x80000000, %eax
+	jbe	no_long_mode
+	/* Check if long mode is implemented */
+	mov	$0x80000001, %eax
+	cpuid
+	btl	$29, %edx
+	jnc	no_long_mode
+
+	/*
+	 * Prepare for entering 64bits mode
+	 */
+
+	/* Enable PAE mode */
+	xorl	%eax, %eax
+	btsl	$5, %eax
+	movl	%eax, %cr4
+
+	/* Setup early boot stage 4 level pagetables */
+	movl	$(init_level4_pgt - __START_KERNEL_map), %eax
+	movl	%eax, %cr3
+
+	/* Setup EFER (Extended Feature Enable Register) */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+
+	/* Enable Long Mode */
+	btsl	$_EFER_LME, %eax
+				
+	/* Make changes effective */
+	wrmsr
+
+	xorl	%eax, %eax
+	btsl	$31, %eax			/* Enable paging and in turn activate Long Mode */
+	btsl	$0, %eax			/* Enable protected mode */
+	/* Make changes effective */
+	movl	%eax, %cr0
+	/*
+	 * At this point we're in long mode but in 32bit compatibility mode
+	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
+	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
+	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	 */
+	ljmp	$__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
+
+	.code64
+	.org 0x100	
+	.globl startup_64
+startup_64:
+	/* We come here either from startup_32
+	 * or directly from a 64bit bootloader.
+	 * Since we may have come directly from a bootloader we
+	 * reload the page tables here.
+	 */
+
+	/* Enable PAE mode and PGE */
+	xorq	%rax, %rax
+	btsq	$5, %rax
+	btsq	$7, %rax
+	movq	%rax, %cr4
+
+	/* Setup early boot stage 4 level pagetables. */
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	movq	%rax, %cr3
+
+	/* Check if nx is implemented */
+	movl	$0x80000001, %eax
+	cpuid
+	movl	%edx,%edi
+
+	/* Setup EFER (Extended Feature Enable Register) */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+
+	/* Enable System Call */
+	btsl	$_EFER_SCE, %eax
+
+	/* No Execute supported? */
+	btl	$20,%edi
+	jnc     1f
+	btsl	$_EFER_NX, %eax
+1:
+	/* Make changes effective */
+	wrmsr
+
+	/* Setup cr0 */
+	xorq	%rax, %rax
+	btsq	$31, %rax			/* Enable paging */
+	btsq	$0, %rax			/* Enable protected mode */
+	btsq	$1, %rax			/* Enable MP */
+	btsq	$4, %rax			/* Enable ET */
+	btsq	$5, %rax			/* Enable NE */
+	btsq	$16, %rax			/* Enable WP */
+	btsq	$18, %rax			/* Enable AM */
+	/* Make changes effective */
+	movq	%rax, %cr0
+
+	/* Setup a boot time stack */
+	movq init_rsp(%rip),%rsp
+
+	/* zero EFLAGS after setting rsp */
+	pushq $0
+	popfq
+
+	/*
+	 * We must switch to a new descriptor in kernel space for the GDT
+	 * because soon the kernel won't have access anymore to the userspace
+	 * addresses where we're currently running on. We have to do that here
+	 * because in 32bit we couldn't load a 64bit linear address.
+	 */
+	lgdt	cpu_gdt_descr
+
+	/* 
+	 * Setup up a dummy PDA. this is just for some early bootup code
+	 * that does in_interrupt() 
+	 */ 
+	movl	$MSR_GS_BASE,%ecx
+	movq	$empty_zero_page,%rax
+	movq    %rax,%rdx
+	shrq	$32,%rdx
+	wrmsr	
+
+	/* set up data segments. actually 0 would do too */
+	movl $__KERNEL_DS,%eax
+	movl %eax,%ds	
+	movl %eax,%ss
+	movl %eax,%es
+			
+	/* esi is pointer to real mode structure with interesting info.
+	   pass it to C */
+	movl	%esi, %edi
+	
+	/* Finally jump to run C code and to be on real kernel address
+	 * Since we are running on identity-mapped space we have to jump
+	 * to the full 64bit address , this is only possible as indirect
+	 * jump
+	 */
+	movq	initial_code(%rip),%rax
+	jmp	*%rax
+
+	/* SMP bootup changes these two */	
+	.globl	initial_code
+initial_code:
+	.quad	x86_64_start_kernel
+	.globl init_rsp
+init_rsp:
+	.quad  init_thread_union+THREAD_SIZE-8
+
+ENTRY(early_idt_handler)
+	xorl %eax,%eax
+	movq 8(%rsp),%rsi	# get rip
+	movq (%rsp),%rdx
+	movq %cr2,%rcx
+	leaq early_idt_msg(%rip),%rdi
+	call early_printk
+1:	hlt
+	jmp 1b
+
+early_idt_msg:
+	.asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
+
+.code32
+ENTRY(no_long_mode)
+	/* This isn't an x86-64 CPU so hang */
+1:
+	jmp	1b
+
+.org 0xf00
+	.globl pGDT32
+pGDT32:
+	.word	gdt_end-cpu_gdt_table
+	.long	cpu_gdt_table-__START_KERNEL_map
+
+.org 0xf10	
+ljumpvector:
+	.long	startup_64-__START_KERNEL_map
+	.word	__KERNEL_CS
+
+ENTRY(stext)
+ENTRY(_stext)
+
+	/*
+	 * This default setting generates an ident mapping at address 0x100000
+	 * and a mapping for the kernel that precisely maps virtual address
+	 * 0xffffffff80000000 to physical address 0x000000. (always using
+	 * 2Mbyte large pages provided by PAE mode)
+	 */
+.org 0x1000
+ENTRY(init_level4_pgt)
+	.quad	0x0000000000102007		/* -> level3_ident_pgt */
+	.fill	255,8,0
+	.quad	0x000000000010a007
+	.fill	254,8,0
+	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+	.quad	0x0000000000103007		/* -> level3_kernel_pgt */
+
+.org 0x2000
+ENTRY(level3_ident_pgt)
+	.quad	0x0000000000104007
+	.fill	511,8,0
+
+.org 0x3000
+ENTRY(level3_kernel_pgt)
+	.fill	510,8,0
+	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
+	.quad	0x0000000000105007		/* -> level2_kernel_pgt */
+	.fill	1,8,0
+
+.org 0x4000
+ENTRY(level2_ident_pgt)
+	/* 40MB for bootup. 	*/
+	.quad	0x0000000000000283
+	.quad	0x0000000000200183
+	.quad	0x0000000000400183
+	.quad	0x0000000000600183
+	.quad	0x0000000000800183
+	.quad	0x0000000000A00183
+	.quad	0x0000000000C00183
+	.quad	0x0000000000E00183
+	.quad	0x0000000001000183
+	.quad	0x0000000001200183
+	.quad	0x0000000001400183
+	.quad	0x0000000001600183
+	.quad	0x0000000001800183
+	.quad	0x0000000001A00183
+	.quad	0x0000000001C00183
+	.quad	0x0000000001E00183
+	.quad	0x0000000002000183
+	.quad	0x0000000002200183
+	.quad	0x0000000002400183
+	.quad	0x0000000002600183
+	/* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
+	.globl temp_boot_pmds
+temp_boot_pmds:
+	.fill	492,8,0
+	
+.org 0x5000
+ENTRY(level2_kernel_pgt)
+	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
+	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
+	/* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
+	.quad	0x0000000000000183
+	.quad	0x0000000000200183
+	.quad	0x0000000000400183
+	.quad	0x0000000000600183
+	.quad	0x0000000000800183
+	.quad	0x0000000000A00183
+	.quad	0x0000000000C00183
+	.quad	0x0000000000E00183
+	.quad	0x0000000001000183
+	.quad	0x0000000001200183
+	.quad	0x0000000001400183
+	.quad	0x0000000001600183
+	.quad	0x0000000001800183
+	.quad	0x0000000001A00183
+	.quad	0x0000000001C00183
+	.quad	0x0000000001E00183
+	.quad	0x0000000002000183
+	.quad	0x0000000002200183
+	.quad	0x0000000002400183
+	.quad	0x0000000002600183
+	/* Module mapping starts here */
+	.fill	492,8,0
+
+.org 0x6000
+ENTRY(empty_zero_page)
+
+.org 0x7000
+ENTRY(empty_bad_page)
+
+.org 0x8000
+ENTRY(empty_bad_pte_table)
+
+.org 0x9000
+ENTRY(empty_bad_pmd_table)
+
+.org 0xa000
+ENTRY(level3_physmem_pgt)
+	.quad	0x0000000000105007		/* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
+
+	.org 0xb000
+#ifdef CONFIG_ACPI_SLEEP
+ENTRY(wakeup_level4_pgt)
+	.quad	0x0000000000102007		/* -> level3_ident_pgt */
+	.fill	255,8,0
+	.quad	0x000000000010a007
+	.fill	254,8,0
+	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+	.quad	0x0000000000103007		/* -> level3_kernel_pgt */
+#endif
+
+	.data
+
+	.align 16
+	.globl cpu_gdt_descr
+cpu_gdt_descr:
+	.word	gdt_end-cpu_gdt_table
+gdt:
+	.quad	cpu_gdt_table
+#ifdef CONFIG_SMP
+	.rept	NR_CPUS-1
+	.word	0
+	.quad	0
+	.endr
+#endif
+
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types  kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout 
+ */
+		 		
+.align L1_CACHE_BYTES
+
+/* The TLS descriptors are currently at a different place compared to i386.
+   Hopefully nobody expects them at a fixed place (Wine?) */
+	
+ENTRY(cpu_gdt_table)
+	.quad	0x0000000000000000	/* NULL descriptor */
+	.quad	0x008f9a000000ffff	/* __KERNEL_COMPAT32_CS */	
+	.quad	0x00af9a000000ffff	/* __KERNEL_CS */
+	.quad	0x00cf92000000ffff	/* __KERNEL_DS */
+	.quad	0x00cffa000000ffff	/* __USER32_CS */
+	.quad	0x00cff2000000ffff	/* __USER_DS, __USER32_DS  */		
+	.quad	0x00affa000000ffff	/* __USER_CS */
+	.quad	0x00cf9a000000ffff	/* __KERNEL32_CS */
+	.quad	0,0			/* TSS */
+	.quad	0,0			/* LDT */
+	.quad   0,0,0			/* three TLS descriptors */ 
+	.quad	0x00009a000000ffff	/* __KERNEL16_CS - 16bit PM for S3 wakeup. */
+					/* base must be patched for real base address. */
+gdt_end:	
+	/* asm/segment.h:GDT_ENTRIES must match this */	
+	/* This should be a multiple of the cache line size */
+	/* GDTs of other CPUs: */	
+	.fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table)
+
+	.align  L1_CACHE_BYTES
+ENTRY(idt_table)	
+	.rept   256
+	.quad   0
+	.quad 	0
+	.endr
+
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
new file mode 100644
index 000000000000..6cad46c98a23
--- /dev/null
+++ b/arch/x86_64/kernel/head64.c
@@ -0,0 +1,117 @@
+/*
+ *  linux/arch/x86_64/kernel/head64.c -- prepare to run common code
+ *
+ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *
+ *  $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/percpu.h>
+
+#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/bootsetup.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+
+/* Don't add a printk in there. printk relies on the PDA which is not initialized 
+   yet. */
+static void __init clear_bss(void)
+{
+	extern char __bss_start[], __bss_end[];
+	memset(__bss_start, 0,
+	       (unsigned long) __bss_end - (unsigned long) __bss_start);
+}
+
+extern char x86_boot_params[2048];
+
+#define NEW_CL_POINTER		0x228	/* Relative to real mode data */
+#define OLD_CL_MAGIC_ADDR	0x90020
+#define OLD_CL_MAGIC            0xA33F
+#define OLD_CL_BASE_ADDR        0x90000
+#define OLD_CL_OFFSET           0x90022
+
+extern char saved_command_line[];
+
+static void __init copy_bootdata(char *real_mode_data)
+{
+	int new_data;
+	char * command_line;
+
+	memcpy(x86_boot_params, real_mode_data, 2048); 
+	new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
+	if (!new_data) {
+		if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
+			printk("so old bootloader that it does not support commandline?!\n");
+			return;
+		}
+		new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
+		printk("old bootloader convention, maybe loadlin?\n");
+	}
+	command_line = (char *) ((u64)(new_data));
+	memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
+	printk("Bootdata ok (command line is %s)\n", saved_command_line);	
+}
+
+static void __init setup_boot_cpu_data(void)
+{
+	unsigned int dummy, eax;
+
+	/* get vendor info */
+	cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
+	      (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
+	      (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
+	      (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
+
+	/* get cpu type */
+	cpuid(1, &eax, &dummy, &dummy,
+		(unsigned int *) &boot_cpu_data.x86_capability);
+	boot_cpu_data.x86 = (eax >> 8) & 0xf;
+	boot_cpu_data.x86_model = (eax >> 4) & 0xf;
+	boot_cpu_data.x86_mask = eax & 0xf;
+}
+
+extern char _end[];
+
+void __init x86_64_start_kernel(char * real_mode_data)
+{
+	char *s;
+	int i;
+
+	for (i = 0; i < 256; i++)
+		set_intr_gate(i, early_idt_handler);
+	asm volatile("lidt %0" :: "m" (idt_descr));
+	clear_bss();
+	pda_init(0);
+	copy_bootdata(real_mode_data);
+#ifdef CONFIG_SMP
+	cpu_set(0, cpu_online_map);
+#endif
+	/* default console: */
+	if (!strstr(saved_command_line, "console="))
+		strcat(saved_command_line, " console=tty0"); 
+	s = strstr(saved_command_line, "earlyprintk=");
+	if (s != NULL)
+		setup_early_printk(s);
+#ifdef CONFIG_DISCONTIGMEM
+	s = strstr(saved_command_line, "numa=");
+	if (s != NULL)
+		numa_setup(s+5);
+#endif
+#ifdef CONFIG_X86_IO_APIC
+	if (strstr(saved_command_line, "disableapic"))
+		disable_apic = 1;
+#endif
+	/* You need early console to see that */
+	if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
+		panic("Kernel too big for kernel mapping\n");
+
+	setup_boot_cpu_data();
+	start_kernel();
+}
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
new file mode 100644
index 000000000000..ba139cac57ce
--- /dev/null
+++ b/arch/x86_64/kernel/i387.c
@@ -0,0 +1,155 @@
+/*
+ *  linux/arch/x86_64/kernel/i387.c
+ *
+ *  Copyright (C) 1994 Linus Torvalds
+ *  Copyright (C) 2002 Andi Kleen, SuSE Labs
+ *
+ *  Pentium III FXSR, SSE support
+ *  General FPU state handling cleanups
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * 
+ *  x86-64 rework 2002 Andi Kleen. 
+ *  Does direct fxsave in and out of user space now for signal handlers.
+ *  All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
+ *  the 64bit user space sees a FXSAVE frame directly. 
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/ptrace.h>
+#include <asm/uaccess.h>
+
+unsigned int mxcsr_feature_mask = 0xffffffff;
+
+void mxcsr_feature_mask_init(void)
+{
+	unsigned int mask;
+	clts();
+	memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
+	asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
+	mask = current->thread.i387.fxsave.mxcsr_mask;
+	if (mask == 0) mask = 0x0000ffbf;
+	mxcsr_feature_mask &= mask;
+	stts();
+}
+
+/*
+ * Called at bootup to set up the initial FPU state that is later cloned
+ * into all processes.
+ */
+void __init fpu_init(void)
+{
+	unsigned long oldcr0 = read_cr0();
+	extern void __bad_fxsave_alignment(void);
+		
+	if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
+		__bad_fxsave_alignment();
+	set_in_cr4(X86_CR4_OSFXSR);
+	set_in_cr4(X86_CR4_OSXMMEXCPT);
+
+	write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
+
+	mxcsr_feature_mask_init();
+	/* clean state in init */
+	current_thread_info()->status = 0;
+	clear_used_math();
+}
+
+void init_fpu(struct task_struct *child)
+{
+	if (tsk_used_math(child)) {
+		if (child == current)
+			unlazy_fpu(child);
+		return;
+	}	
+	memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
+	child->thread.i387.fxsave.cwd = 0x37f;
+	child->thread.i387.fxsave.mxcsr = 0x1f80;
+	/* only the device not available exception or ptrace can call init_fpu */
+	set_stopped_child_used_math(child);
+}
+
+/*
+ * Signal frame handlers.
+ */
+
+int save_i387(struct _fpstate __user *buf)
+{
+	struct task_struct *tsk = current;
+	int err = 0;
+
+	{ 
+		extern void bad_user_i387_struct(void); 
+		if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave))
+			bad_user_i387_struct();
+	} 
+
+	if ((unsigned long)buf % 16) 
+		printk("save_i387: bad fpstate %p\n",buf); 
+
+	if (!used_math())
+		return 0;
+	clear_used_math(); /* trigger finit */
+	if (tsk->thread_info->status & TS_USEDFPU) {
+		err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
+		if (err) return err;
+		stts();
+		} else {
+		if (__copy_to_user(buf, &tsk->thread.i387.fxsave, 
+				   sizeof(struct i387_fxsave_struct)))
+			return -1;
+	} 
+		return 1;
+}
+
+/*
+ * ptrace request handlers.
+ */
+
+int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
+{
+	init_fpu(tsk);
+	return __copy_to_user(buf, &tsk->thread.i387.fxsave,
+			       sizeof(struct user_i387_struct)) ? -EFAULT : 0;
+}
+
+int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
+{
+	if (__copy_from_user(&tsk->thread.i387.fxsave, buf, 
+			     sizeof(struct user_i387_struct)))
+		return -EFAULT;
+		return 0;
+}
+
+/*
+ * FPU state for core dumps.
+ */
+
+int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
+{
+	struct task_struct *tsk = current;
+
+	if (!used_math())
+		return 0;
+
+	unlazy_fpu(tsk);
+	memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); 
+	return 1; 
+}
+
+int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
+{
+	int fpvalid = !!tsk_used_math(tsk);
+
+	if (fpvalid) {
+		if (tsk == current)
+			unlazy_fpu(tsk);
+		memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); 	
+}
+	return fpvalid;
+}
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
new file mode 100644
index 000000000000..d52701246010
--- /dev/null
+++ b/arch/x86_64/kernel/i8259.c
@@ -0,0 +1,579 @@
+#include <linux/linkage.h>
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/acpi.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+
+#include <linux/irq.h>
+
+/*
+ * Common place to define all x86 IRQ vectors
+ *
+ * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ *
+ * These macros create the low-level assembly IRQ routines that save
+ * register context and call do_IRQ(). do_IRQ() then does all the
+ * operations that are needed to keep the AT (or SMP IOAPIC)
+ * interrupt-controller happy.
+ */
+
+#define BI(x,y) \
+	BUILD_IRQ(x##y)
+
+#define BUILD_16_IRQS(x) \
+	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+	BI(x,c) BI(x,d) BI(x,e) BI(x,f)
+
+#define BUILD_14_IRQS(x) \
+	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+	BI(x,c) BI(x,d)
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x20-0x2f)
+ */
+BUILD_16_IRQS(0x0)
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these 
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+		   BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
+BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
+BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
+BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
+
+#ifdef CONFIG_PCI_MSI
+	BUILD_14_IRQS(0xe)
+#endif
+
+#endif
+
+#undef BUILD_16_IRQS
+#undef BUILD_14_IRQS
+#undef BI
+
+
+#define IRQ(x,y) \
+	IRQ##x##y##_interrupt
+
+#define IRQLIST_16(x) \
+	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
+
+#define IRQLIST_14(x) \
+	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+	IRQ(x,c), IRQ(x,d)
+
+void (*interrupt[NR_IRQS])(void) = {
+	IRQLIST_16(0x0),
+
+#ifdef CONFIG_X86_IO_APIC
+			 IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
+	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
+	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
+	IRQLIST_16(0xc), IRQLIST_16(0xd)
+
+#ifdef CONFIG_PCI_MSI
+	, IRQLIST_14(0xe)
+#endif
+
+#endif
+};
+
+#undef IRQ
+#undef IRQLIST_16
+#undef IRQLIST_14
+
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86 specific things if generic specifics makes
+ * any sense at all.
+ * this file should become arch/i386/kernel/irq.c when the old irq.c
+ * moves to arch independent land
+ */
+
+DEFINE_SPINLOCK(i8259A_lock);
+
+static void end_8259A_irq (unsigned int irq)
+{
+	if (irq > 256) { 
+		char var;
+		printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info); 
+
+		BUG(); 
+	}
+
+	if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) &&
+	    irq_desc[irq].action)
+		enable_8259A_irq(irq);
+}
+
+#define shutdown_8259A_irq	disable_8259A_irq
+
+static void mask_and_ack_8259A(unsigned int);
+
+static unsigned int startup_8259A_irq(unsigned int irq)
+{ 
+	enable_8259A_irq(irq);
+	return 0; /* never anything pending */
+}
+
+static struct hw_interrupt_type i8259A_irq_type = {
+	"XT-PIC",
+	startup_8259A_irq,
+	shutdown_8259A_irq,
+	enable_8259A_irq,
+	disable_8259A_irq,
+	mask_and_ack_8259A,
+	end_8259A_irq,
+	NULL
+};
+
+/*
+ * 8259A PIC functions to handle ISA devices:
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers,
+ */
+static unsigned int cached_irq_mask = 0xffff;
+
+#define __byte(x,y) 	(((unsigned char *)&(y))[x])
+#define cached_21	(__byte(0,cached_irq_mask))
+#define cached_A1	(__byte(1,cached_irq_mask))
+
+/*
+ * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
+ *
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * this 'mixed mode' IRQ handling costs nothing because it's only used
+ * at IRQ setup time.
+ */
+unsigned long io_apic_irqs;
+
+void disable_8259A_irq(unsigned int irq)
+{
+	unsigned int mask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask |= mask;
+	if (irq & 8)
+		outb(cached_A1,0xA1);
+	else
+		outb(cached_21,0x21);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+void enable_8259A_irq(unsigned int irq)
+{
+	unsigned int mask = ~(1 << irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask &= mask;
+	if (irq & 8)
+		outb(cached_A1,0xA1);
+	else
+		outb(cached_21,0x21);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+int i8259A_irq_pending(unsigned int irq)
+{
+	unsigned int mask = 1<<irq;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	if (irq < 8)
+		ret = inb(0x20) & mask;
+	else
+		ret = inb(0xA0) & (mask >> 8);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+
+	return ret;
+}
+
+void make_8259A_irq(unsigned int irq)
+{
+	disable_irq_nosync(irq);
+	io_apic_irqs &= ~(1<<irq);
+	irq_desc[irq].handler = &i8259A_irq_type;
+	enable_irq(irq);
+}
+
+/*
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ * This has to be protected by the irq controller spinlock
+ * before being called.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+	int value;
+	int irqmask = 1<<irq;
+
+	if (irq < 8) {
+		outb(0x0B,0x20);		/* ISR register */
+		value = inb(0x20) & irqmask;
+		outb(0x0A,0x20);		/* back to the IRR register */
+		return value;
+	}
+	outb(0x0B,0xA0);		/* ISR register */
+	value = inb(0xA0) & (irqmask >> 8);
+	outb(0x0A,0xA0);		/* back to the IRR register */
+	return value;
+}
+
+/*
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI, and the order of EOI
+ * to the two 8259s is important!
+ */
+static void mask_and_ack_8259A(unsigned int irq)
+{
+	unsigned int irqmask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	/*
+	 * Lightweight spurious IRQ detection. We do not want
+	 * to overdo spurious IRQ handling - it's usually a sign
+	 * of hardware problems, so we only do the checks we can
+	 * do without slowing down good hardware unnecesserily.
+	 *
+	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
+	 * usually resulting from the 8259A-1|2 PICs) occur
+	 * even if the IRQ is masked in the 8259A. Thus we
+	 * can check spurious 8259A IRQs without doing the
+	 * quite slow i8259A_irq_real() call for every IRQ.
+	 * This does not cover 100% of spurious interrupts,
+	 * but should be enough to warn the user that there
+	 * is something bad going on ...
+	 */
+	if (cached_irq_mask & irqmask)
+		goto spurious_8259A_irq;
+	cached_irq_mask |= irqmask;
+
+handle_real_irq:
+	if (irq & 8) {
+		inb(0xA1);		/* DUMMY - (do we need this?) */
+		outb(cached_A1,0xA1);
+		outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
+		outb(0x62,0x20);	/* 'Specific EOI' to master-IRQ2 */
+	} else {
+		inb(0x21);		/* DUMMY - (do we need this?) */
+		outb(cached_21,0x21);
+		outb(0x60+irq,0x20);	/* 'Specific EOI' to master */
+	}
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+	return;
+
+spurious_8259A_irq:
+	/*
+	 * this is the slow path - should happen rarely.
+	 */
+	if (i8259A_irq_real(irq))
+		/*
+		 * oops, the IRQ _is_ in service according to the
+		 * 8259A - not spurious, go handle it.
+		 */
+		goto handle_real_irq;
+
+	{
+		static int spurious_irq_mask;
+		/*
+		 * At this point we can be sure the IRQ is spurious,
+		 * lets ACK and report it. [once per IRQ]
+		 */
+		if (!(spurious_irq_mask & irqmask)) {
+			printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+			spurious_irq_mask |= irqmask;
+		}
+		atomic_inc(&irq_err_count);
+		/*
+		 * Theoretically we do not have to handle this IRQ,
+		 * but in Linux this does not cause problems and is
+		 * simpler for us.
+		 */
+		goto handle_real_irq;
+	}
+}
+
+void init_8259A(int auto_eoi)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	outb(0xff, 0x21);	/* mask all of 8259A-1 */
+	outb(0xff, 0xA1);	/* mask all of 8259A-2 */
+
+	/*
+	 * outb_p - this has to work on a wide range of PC hardware.
+	 */
+	outb_p(0x11, 0x20);	/* ICW1: select 8259A-1 init */
+	outb_p(0x20 + 0, 0x21);	/* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
+	outb_p(0x04, 0x21);	/* 8259A-1 (the master) has a slave on IR2 */
+	if (auto_eoi)
+		outb_p(0x03, 0x21);	/* master does Auto EOI */
+	else
+		outb_p(0x01, 0x21);	/* master expects normal EOI */
+
+	outb_p(0x11, 0xA0);	/* ICW1: select 8259A-2 init */
+	outb_p(0x20 + 8, 0xA1);	/* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
+	outb_p(0x02, 0xA1);	/* 8259A-2 is a slave on master's IR2 */
+	outb_p(0x01, 0xA1);	/* (slave's support for AEOI in flat mode
+				    is to be investigated) */
+
+	if (auto_eoi)
+		/*
+		 * in AEOI mode we just have to mask the interrupt
+		 * when acking.
+		 */
+		i8259A_irq_type.ack = disable_8259A_irq;
+	else
+		i8259A_irq_type.ack = mask_and_ack_8259A;
+
+	udelay(100);		/* wait for 8259A to initialize */
+
+	outb(cached_21, 0x21);	/* restore master IRQ mask */
+	outb(cached_A1, 0xA1);	/* restore slave IRQ mask */
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static char irq_trigger[2];
+/**
+ * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
+ */
+static void restore_ELCR(char *trigger)
+{
+	outb(trigger[0], 0x4d0);
+	outb(trigger[1], 0x4d1);
+}
+
+static void save_ELCR(char *trigger)
+{
+	/* IRQ 0,1,2,8,13 are marked as reserved */
+	trigger[0] = inb(0x4d0) & 0xF8;
+	trigger[1] = inb(0x4d1) & 0xDE;
+}
+
+static int i8259A_resume(struct sys_device *dev)
+{
+	init_8259A(0);
+	restore_ELCR(irq_trigger);
+	return 0;
+}
+
+static int i8259A_suspend(struct sys_device *dev, u32 state)
+{
+	save_ELCR(irq_trigger);
+	return 0;
+}
+
+static struct sysdev_class i8259_sysdev_class = {
+	set_kset_name("i8259"),
+	.suspend = i8259A_suspend,
+	.resume = i8259A_resume,
+};
+
+static struct sys_device device_i8259A = {
+	.id	= 0,
+	.cls	= &i8259_sysdev_class,
+};
+
+static int __init i8259A_init_sysfs(void)
+{
+	int error = sysdev_class_register(&i8259_sysdev_class);
+	if (!error)
+		error = sysdev_register(&device_i8259A);
+	return error;
+}
+
+device_initcall(i8259A_init_sysfs);
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+
+static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
+
+void __init init_ISA_irqs (void)
+{
+	int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	init_bsp_APIC();
+#endif
+	init_8259A(0);
+
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = NULL;
+		irq_desc[i].depth = 1;
+
+		if (i < 16) {
+			/*
+			 * 16 old-style INTA-cycle interrupts:
+			 */
+			irq_desc[i].handler = &i8259A_irq_type;
+		} else {
+			/*
+			 * 'high' PCI IRQs filled in on demand
+			 */
+			irq_desc[i].handler = &no_irq_type;
+		}
+	}
+}
+
+void apic_timer_interrupt(void);
+void spurious_interrupt(void);
+void error_interrupt(void);
+void reschedule_interrupt(void);
+void call_function_interrupt(void);
+void invalidate_interrupt(void);
+void thermal_interrupt(void);
+void i8254_timer_resume(void);
+
+static void setup_timer(void)
+{
+	outb_p(0x34,0x43);		/* binary, mode 2, LSB/MSB, ch 0 */
+	udelay(10);
+	outb_p(LATCH & 0xff , 0x40);	/* LSB */
+	udelay(10);
+	outb(LATCH >> 8 , 0x40);	/* MSB */
+}
+
+static int timer_resume(struct sys_device *dev)
+{
+	setup_timer();
+	return 0;
+}
+
+void i8254_timer_resume(void)
+{
+	setup_timer();
+}
+
+static struct sysdev_class timer_sysclass = {
+	set_kset_name("timer"),
+	.resume		= timer_resume,
+};
+
+static struct sys_device device_timer = {
+	.id		= 0,
+	.cls		= &timer_sysclass,
+};
+
+static int __init init_timer_sysfs(void)
+{
+	int error = sysdev_class_register(&timer_sysclass);
+	if (!error)
+		error = sysdev_register(&device_timer);
+	return error;
+}
+
+device_initcall(init_timer_sysfs);
+
+void __init init_IRQ(void)
+{
+	int i;
+
+	init_ISA_irqs();
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (i >= NR_IRQS)
+			break;
+		if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) { 
+			set_intr_gate(vector, interrupt[i]);
+	}
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * IRQ0 must be given a fixed assignment and initialized,
+	 * because it's used before the IO-APIC is set up.
+	 */
+	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPI for invalidation */
+	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+	/* IPI for generic function call */
+	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+#endif	
+	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/* self generated IPI for local APIC timer */
+	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+	/* IPI vectors for APIC spurious and error interrupts */
+	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+#endif
+
+	/*
+	 * Set the clock to HZ Hz, we already have a valid
+	 * vector now:
+	 */
+	setup_timer();
+
+	if (!acpi_ioapic)
+		setup_irq(2, &irq2);
+}
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
new file mode 100644
index 000000000000..c4dc91491157
--- /dev/null
+++ b/arch/x86_64/kernel/init_task.c
@@ -0,0 +1,49 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+#include <linux/mqueue.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+
+static struct fs_struct init_fs = INIT_FS;
+static struct files_struct init_files = INIT_FILES;
+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+struct mm_struct init_mm = INIT_MM(init_mm);
+
+EXPORT_SYMBOL(init_mm);
+
+/*
+ * Initial task structure.
+ *
+ * We need to make sure that this is 8192-byte aligned due to the
+ * way process stacks are handled. This is done by having a special
+ * "init_task" linker map entry..
+ */
+union thread_union init_thread_union 
+	__attribute__((__section__(".data.init_task"))) =
+		{ INIT_THREAD_INFO(init_task) };
+
+/*
+ * Initial task structure.
+ *
+ * All other task structs will be allocated on slabs in fork.c
+ */
+struct task_struct init_task = INIT_TASK(init_task);
+
+EXPORT_SYMBOL(init_task);
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data.cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */ 
+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp;
+
+#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
new file mode 100644
index 000000000000..adebe70c180e
--- /dev/null
+++ b/arch/x86_64/kernel/io_apic.c
@@ -0,0 +1,1982 @@
+/*
+ *	Intel IO-APIC support for multi-Pentium hosts.
+ *
+ *	Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ *
+ *	Many thanks to Stig Venaas for trying out countless experimental
+ *	patches and reporting/debugging problems patiently!
+ *
+ *	(c) 1999, Multiple IO-APIC support, developed by
+ *	Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ *	further tested and cleaned up by Zach Brown <zab@redhat.com>
+ *	and Ingo Molnar <mingo@redhat.com>
+ *
+ *	Fixes
+ *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs;
+ *					thanks to Eric Gilmore
+ *					and Rolf G. Tews
+ *					for testing these extensively
+ *	Paul Diefenbaugh	:	Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/config.h>
+#include <linux/smp_lock.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
+#include <linux/sysdev.h>
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/mach_apic.h>
+
+#define __apicdebuginit  __init
+
+int sis_apic_bug; /* not actually supported, dummy for compile */
+
+static DEFINE_SPINLOCK(ioapic_lock);
+
+/*
+ * # of IRQ routing registers
+ */
+int nr_ioapic_registers[MAX_IO_APICS];
+
+/*
+ * Rough estimation of how many shared IRQs there are, can
+ * be changed anytime.
+ */
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+static struct irq_pin_list {
+	short apic, pin, next;
+} irq_2_pin[PIN_MAP_SIZE];
+
+int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+#ifdef CONFIG_PCI_MSI
+#define vector_to_irq(vector) 	\
+	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
+#else
+#define vector_to_irq(vector)	(vector)
+#endif
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+	static int first_free_entry = NR_IRQS;
+	struct irq_pin_list *entry = irq_2_pin + irq;
+
+	while (entry->next)
+		entry = irq_2_pin + entry->next;
+
+	if (entry->pin != -1) {
+		entry->next = first_free_entry;
+		entry = irq_2_pin + entry->next;
+		if (++first_free_entry >= PIN_MAP_SIZE)
+			panic("io_apic.c: whoops");
+	}
+	entry->apic = apic;
+	entry->pin = pin;
+}
+
+#define __DO_ACTION(R, ACTION, FINAL)					\
+									\
+{									\
+	int pin;							\
+	struct irq_pin_list *entry = irq_2_pin + irq;			\
+									\
+	for (;;) {							\
+		unsigned int reg;					\
+		pin = entry->pin;					\
+		if (pin == -1)						\
+			break;						\
+		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
+		reg ACTION;						\
+		io_apic_modify(entry->apic, reg);			\
+		if (!entry->next)					\
+			break;						\
+		entry = irq_2_pin + entry->next;			\
+	}								\
+	FINAL;								\
+}
+
+#define DO_ACTION(name,R,ACTION, FINAL)					\
+									\
+	static void name##_IO_APIC_irq (unsigned int irq)		\
+	__DO_ACTION(R, ACTION, FINAL)
+
+DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
+						/* mask = 1 */
+DO_ACTION( __unmask,           0, &= 0xfffeffff, )
+						/* mask = 0 */
+
+static void mask_IO_APIC_irq (unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__mask_IO_APIC_irq(irq);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_IO_APIC_irq (unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__unmask_IO_APIC_irq(irq);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+	struct IO_APIC_route_entry entry;
+	unsigned long flags;
+
+	/* Check delivery_mode to be sure we're not clearing an SMI pin */
+	spin_lock_irqsave(&ioapic_lock, flags);
+	*(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
+	*(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+	if (entry.delivery_mode == dest_SMI)
+		return;
+	/*
+	 * Disable it in the IO-APIC irq-routing table:
+	 */
+	memset(&entry, 0, sizeof(entry));
+	entry.mask = 1;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
+	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC (void)
+{
+	int apic, pin;
+
+	for (apic = 0; apic < nr_ioapics; apic++)
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+			clear_IO_APIC_pin(apic, pin);
+}
+
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+int skip_ioapic_setup;
+int ioapic_force;
+
+/* dummy parsing: see setup.c */
+
+static int __init disable_ioapic_setup(char *str)
+{
+	skip_ioapic_setup = 1;
+	return 1;
+}
+
+static int __init enable_ioapic_setup(char *str)
+{
+	ioapic_force = 1;
+	skip_ioapic_setup = 0;
+	return 1;
+}
+
+__setup("noapic", disable_ioapic_setup);
+__setup("apic", enable_ioapic_setup);
+
+#include <asm/pci-direct.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+
+/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
+   off. Check for an Nvidia or VIA PCI bridge and turn it off.
+   Use pci direct infrastructure because this runs before the PCI subsystem. 
+
+   Can be overwritten with "apic"
+
+   And another hack to disable the IOMMU on VIA chipsets.
+
+   Kludge-O-Rama. */
+void __init check_ioapic(void) 
+{ 
+	int num,slot,func; 
+	if (ioapic_force) 
+		return; 
+
+	/* Poor man's PCI discovery */
+	for (num = 0; num < 32; num++) { 
+		for (slot = 0; slot < 32; slot++) { 
+			for (func = 0; func < 8; func++) { 
+				u32 class;
+				u32 vendor;
+				u8 type;
+				class = read_pci_config(num,slot,func,
+							PCI_CLASS_REVISION);
+				if (class == 0xffffffff)
+					break; 
+
+		       		if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
+					continue; 
+
+				vendor = read_pci_config(num, slot, func, 
+							 PCI_VENDOR_ID);
+				vendor &= 0xffff;
+				switch (vendor) { 
+				case PCI_VENDOR_ID_VIA:
+#ifdef CONFIG_GART_IOMMU
+					if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) ||
+					     force_iommu) &&
+					    !iommu_aperture_allowed) {
+						printk(KERN_INFO
+    "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n");
+						iommu_aperture_disabled = 1;
+					}
+#endif
+					return;
+				case PCI_VENDOR_ID_NVIDIA:
+#ifdef CONFIG_ACPI
+					/* All timer overrides on Nvidia
+				           seem to be wrong. Skip them. */
+					acpi_skip_timer_override = 1;
+					printk(KERN_INFO 
+	     "Nvidia board detected. Ignoring ACPI timer override.\n");
+#endif
+					/* RED-PEN skip them on mptables too? */
+					return;
+				} 
+
+				/* No multi-function device? */
+				type = read_pci_config_byte(num,slot,func,
+							    PCI_HEADER_TYPE);
+				if (!(type & 0x80))
+					break;
+			} 
+		}
+	}
+} 
+
+static int __init ioapic_pirq_setup(char *str)
+{
+	int i, max;
+	int ints[MAX_PIRQS+1];
+
+	get_options(str, ARRAY_SIZE(ints), ints);
+
+	for (i = 0; i < MAX_PIRQS; i++)
+		pirq_entries[i] = -1;
+
+	pirqs_enabled = 1;
+	apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
+	max = MAX_PIRQS;
+	if (ints[0] < MAX_PIRQS)
+		max = ints[0];
+
+	for (i = 0; i < max; i++) {
+		apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+		/*
+		 * PIRQs are mapped upside down, usually.
+		 */
+		pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+	}
+	return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+static int find_irq_entry(int apic, int pin, int type)
+{
+	int i;
+
+	for (i = 0; i < mp_irq_entries; i++)
+		if (mp_irqs[i].mpc_irqtype == type &&
+		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
+		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
+		    mp_irqs[i].mpc_dstirq == pin)
+			return i;
+
+	return -1;
+}
+
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+	int i;
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		int lbus = mp_irqs[i].mpc_srcbus;
+
+		if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
+		     mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
+		     mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
+		    (mp_irqs[i].mpc_irqtype == type) &&
+		    (mp_irqs[i].mpc_srcbusirq == irq))
+
+			return mp_irqs[i].mpc_dstirq;
+	}
+	return -1;
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+static int pin_2_irq(int idx, int apic, int pin);
+
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+	int apic, i, best_guess = -1;
+
+	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+		bus, slot, pin);
+	if (mp_bus_id_to_pci_bus[bus] == -1) {
+		apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+		return -1;
+	}
+	for (i = 0; i < mp_irq_entries; i++) {
+		int lbus = mp_irqs[i].mpc_srcbus;
+
+		for (apic = 0; apic < nr_ioapics; apic++)
+			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+				break;
+
+		if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+		    !mp_irqs[i].mpc_irqtype &&
+		    (bus == lbus) &&
+		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+
+			if (!(apic || IO_APIC_IRQ(irq)))
+				continue;
+
+			if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+				return irq;
+			/*
+			 * Use the first all-but-pin matching entry as a
+			 * best-guess fuzzy result for broken mptables.
+			 */
+			if (best_guess < 0)
+				best_guess = irq;
+		}
+	}
+	return best_guess;
+}
+
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+	if (irq < 16) {
+		unsigned int port = 0x4d0 + (irq >> 3);
+		return (inb(port) >> (irq & 7)) & 1;
+	}
+	apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
+	return 0;
+}
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value.  If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_polarity(idx)	(0)
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx)	(0)
+#define default_ISA_polarity(idx)	(0)
+
+/* PCI interrupts are always polarity one level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx)	(1)
+#define default_PCI_polarity(idx)	(1)
+
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx)	(1)
+#define default_MCA_polarity(idx)	(0)
+
+static int __init MPBIOS_polarity(int idx)
+{
+	int bus = mp_irqs[idx].mpc_srcbus;
+	int polarity;
+
+	/*
+	 * Determine IRQ line polarity (high active or low active):
+	 */
+	switch (mp_irqs[idx].mpc_irqflag & 3)
+	{
+		case 0: /* conforms, ie. bus-type dependent polarity */
+		{
+			switch (mp_bus_id_to_type[bus])
+			{
+				case MP_BUS_ISA: /* ISA pin */
+				{
+					polarity = default_ISA_polarity(idx);
+					break;
+				}
+				case MP_BUS_EISA: /* EISA pin */
+				{
+					polarity = default_EISA_polarity(idx);
+					break;
+				}
+				case MP_BUS_PCI: /* PCI pin */
+				{
+					polarity = default_PCI_polarity(idx);
+					break;
+				}
+				case MP_BUS_MCA: /* MCA pin */
+				{
+					polarity = default_MCA_polarity(idx);
+					break;
+				}
+				default:
+				{
+					printk(KERN_WARNING "broken BIOS!!\n");
+					polarity = 1;
+					break;
+				}
+			}
+			break;
+		}
+		case 1: /* high active */
+		{
+			polarity = 0;
+			break;
+		}
+		case 2: /* reserved */
+		{
+			printk(KERN_WARNING "broken BIOS!!\n");
+			polarity = 1;
+			break;
+		}
+		case 3: /* low active */
+		{
+			polarity = 1;
+			break;
+		}
+		default: /* invalid */
+		{
+			printk(KERN_WARNING "broken BIOS!!\n");
+			polarity = 1;
+			break;
+		}
+	}
+	return polarity;
+}
+
+static int MPBIOS_trigger(int idx)
+{
+	int bus = mp_irqs[idx].mpc_srcbus;
+	int trigger;
+
+	/*
+	 * Determine IRQ trigger mode (edge or level sensitive):
+	 */
+	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+	{
+		case 0: /* conforms, ie. bus-type dependent */
+		{
+			switch (mp_bus_id_to_type[bus])
+			{
+				case MP_BUS_ISA: /* ISA pin */
+				{
+					trigger = default_ISA_trigger(idx);
+					break;
+				}
+				case MP_BUS_EISA: /* EISA pin */
+				{
+					trigger = default_EISA_trigger(idx);
+					break;
+				}
+				case MP_BUS_PCI: /* PCI pin */
+				{
+					trigger = default_PCI_trigger(idx);
+					break;
+				}
+				case MP_BUS_MCA: /* MCA pin */
+				{
+					trigger = default_MCA_trigger(idx);
+					break;
+				}
+				default:
+				{
+					printk(KERN_WARNING "broken BIOS!!\n");
+					trigger = 1;
+					break;
+				}
+			}
+			break;
+		}
+		case 1: /* edge */
+		{
+			trigger = 0;
+			break;
+		}
+		case 2: /* reserved */
+		{
+			printk(KERN_WARNING "broken BIOS!!\n");
+			trigger = 1;
+			break;
+		}
+		case 3: /* level */
+		{
+			trigger = 1;
+			break;
+		}
+		default: /* invalid */
+		{
+			printk(KERN_WARNING "broken BIOS!!\n");
+			trigger = 0;
+			break;
+		}
+	}
+	return trigger;
+}
+
+static inline int irq_polarity(int idx)
+{
+	return MPBIOS_polarity(idx);
+}
+
+static inline int irq_trigger(int idx)
+{
+	return MPBIOS_trigger(idx);
+}
+
+static int pin_2_irq(int idx, int apic, int pin)
+{
+	int irq, i;
+	int bus = mp_irqs[idx].mpc_srcbus;
+
+	/*
+	 * Debugging check, we are in big trouble if this message pops up!
+	 */
+	if (mp_irqs[idx].mpc_dstirq != pin)
+		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+	switch (mp_bus_id_to_type[bus])
+	{
+		case MP_BUS_ISA: /* ISA pin */
+		case MP_BUS_EISA:
+		case MP_BUS_MCA:
+		{
+			irq = mp_irqs[idx].mpc_srcbusirq;
+			break;
+		}
+		case MP_BUS_PCI: /* PCI pin */
+		{
+			/*
+			 * PCI IRQs are mapped in order
+			 */
+			i = irq = 0;
+			while (i < apic)
+				irq += nr_ioapic_registers[i++];
+			irq += pin;
+			break;
+		}
+		default:
+		{
+			printk(KERN_ERR "unknown bus type %d.\n",bus); 
+			irq = 0;
+			break;
+		}
+	}
+
+	/*
+	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
+	 */
+	if ((pin >= 16) && (pin <= 23)) {
+		if (pirq_entries[pin-16] != -1) {
+			if (!pirq_entries[pin-16]) {
+				apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
+			} else {
+				irq = pirq_entries[pin-16];
+				apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
+						pin-16, irq);
+			}
+		}
+	}
+	return irq;
+}
+
+static inline int IO_APIC_irq_trigger(int irq)
+{
+	int apic, idx, pin;
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+			idx = find_irq_entry(apic,pin,mp_INT);
+			if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+				return irq_trigger(idx);
+		}
+	}
+	/*
+	 * nonexistent IRQs are edge default
+	 */
+	return 0;
+}
+
+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
+u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
+
+int assign_irq_vector(int irq)
+{
+	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
+
+	BUG_ON(irq >= NR_IRQ_VECTORS);
+	if (IO_APIC_VECTOR(irq) > 0)
+		return IO_APIC_VECTOR(irq);
+next:
+	current_vector += 8;
+	if (current_vector == IA32_SYSCALL_VECTOR)
+		goto next;
+
+	if (current_vector >= FIRST_SYSTEM_VECTOR) {
+		offset++;
+		if (!(offset%8))
+			return -ENOSPC;
+		current_vector = FIRST_DEVICE_VECTOR + offset;
+	}
+
+	vector_irq[current_vector] = irq;
+	if (irq != AUTO_ASSIGN)
+		IO_APIC_VECTOR(irq) = current_vector;
+
+	return current_vector;
+}
+
+extern void (*interrupt[NR_IRQS])(void);
+static struct hw_interrupt_type ioapic_level_type;
+static struct hw_interrupt_type ioapic_edge_type;
+
+#define IOAPIC_AUTO	-1
+#define IOAPIC_EDGE	0
+#define IOAPIC_LEVEL	1
+
+static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+{
+	if (use_pci_vector() && !platform_legacy_irq(irq)) {
+		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+				trigger == IOAPIC_LEVEL)
+			irq_desc[vector].handler = &ioapic_level_type;
+		else
+			irq_desc[vector].handler = &ioapic_edge_type;
+		set_intr_gate(vector, interrupt[vector]);
+	} else	{
+		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+				trigger == IOAPIC_LEVEL)
+			irq_desc[irq].handler = &ioapic_level_type;
+		else
+			irq_desc[irq].handler = &ioapic_edge_type;
+		set_intr_gate(vector, interrupt[irq]);
+	}
+}
+
+static void __init setup_IO_APIC_irqs(void)
+{
+	struct IO_APIC_route_entry entry;
+	int apic, pin, idx, irq, first_notcon = 1, vector;
+	unsigned long flags;
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+		/*
+		 * add it to the IO-APIC irq-routing table:
+		 */
+		memset(&entry,0,sizeof(entry));
+
+		entry.delivery_mode = INT_DELIVERY_MODE;
+		entry.dest_mode = INT_DEST_MODE;
+		entry.mask = 0;				/* enable IRQ */
+		entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+		idx = find_irq_entry(apic,pin,mp_INT);
+		if (idx == -1) {
+			if (first_notcon) {
+				apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+				first_notcon = 0;
+			} else
+				apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+			continue;
+		}
+
+		entry.trigger = irq_trigger(idx);
+		entry.polarity = irq_polarity(idx);
+
+		if (irq_trigger(idx)) {
+			entry.trigger = 1;
+			entry.mask = 1;
+			entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+		}
+
+		irq = pin_2_irq(idx, apic, pin);
+		add_pin_to_irq(irq, apic, pin);
+
+		if (!apic && !IO_APIC_IRQ(irq))
+			continue;
+
+		if (IO_APIC_IRQ(irq)) {
+			vector = assign_irq_vector(irq);
+			entry.vector = vector;
+
+			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+			if (!apic && (irq < 16))
+				disable_8259A_irq(irq);
+		}
+		spin_lock_irqsave(&ioapic_lock, flags);
+		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
+		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+	}
+	}
+
+	if (!first_notcon)
+		apic_printk(APIC_VERBOSE," not connected.\n");
+}
+
+/*
+ * Set up the 8259A-master output pin as broadcast to all
+ * CPUs.
+ */
+static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
+{
+	struct IO_APIC_route_entry entry;
+	unsigned long flags;
+
+	memset(&entry,0,sizeof(entry));
+
+	disable_8259A_irq(0);
+
+	/* mask LVT0 */
+	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+
+	/*
+	 * We use logical delivery to get the timer IRQ
+	 * to the first CPU.
+	 */
+	entry.dest_mode = INT_DEST_MODE;
+	entry.mask = 0;					/* unmask IRQ now */
+	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+	entry.delivery_mode = INT_DELIVERY_MODE;
+	entry.polarity = 0;
+	entry.trigger = 0;
+	entry.vector = vector;
+
+	/*
+	 * The timer IRQ doesn't have to know that behind the
+	 * scene we have a 8259A-master in AEOI mode ...
+	 */
+	irq_desc[0].handler = &ioapic_edge_type;
+
+	/*
+	 * Add it to the IO-APIC irq-routing table:
+	 */
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+	io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	enable_8259A_irq(0);
+}
+
+void __init UNEXPECTED_IO_APIC(void)
+{
+}
+
+void __apicdebuginit print_IO_APIC(void)
+{
+	int apic, i;
+	union IO_APIC_reg_00 reg_00;
+	union IO_APIC_reg_01 reg_01;
+	union IO_APIC_reg_02 reg_02;
+	unsigned long flags;
+
+	if (apic_verbosity == APIC_QUIET)
+		return;
+
+	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+	for (i = 0; i < nr_ioapics; i++)
+		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+
+	/*
+	 * We are a bit conservative about what we expect.  We have to
+	 * know about every hardware change ASAP.
+	 */
+	printk(KERN_INFO "testing the IO APIC.......................\n");
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_00.raw = io_apic_read(apic, 0);
+	reg_01.raw = io_apic_read(apic, 1);
+	if (reg_01.bits.version >= 0x10)
+		reg_02.raw = io_apic_read(apic, 2);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	printk("\n");
+	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
+	if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
+		UNEXPECTED_IO_APIC();
+
+	printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
+	printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
+	if (	(reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
+		(reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
+		(reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
+		(reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
+		(reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
+		(reg_01.bits.entries != 0x2E) &&
+		(reg_01.bits.entries != 0x3F) &&
+		(reg_01.bits.entries != 0x03) 
+	)
+		UNEXPECTED_IO_APIC();
+
+	printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
+	printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
+	if (	(reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
+		(reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
+		(reg_01.bits.version != 0x10) && /* oldest IO-APICs */
+		(reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
+		(reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
+		(reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
+	)
+		UNEXPECTED_IO_APIC();
+	if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
+		UNEXPECTED_IO_APIC();
+
+	if (reg_01.bits.version >= 0x10) {
+		printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+		printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
+		if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
+			UNEXPECTED_IO_APIC();
+	}
+
+	printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+	printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
+			  " Stat Dest Deli Vect:   \n");
+
+	for (i = 0; i <= reg_01.bits.entries; i++) {
+		struct IO_APIC_route_entry entry;
+
+		spin_lock_irqsave(&ioapic_lock, flags);
+		*(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
+		*(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+
+		printk(KERN_DEBUG " %02x %03X %02X  ",
+			i,
+			entry.dest.logical.logical_dest,
+			entry.dest.physical.physical_dest
+		);
+
+		printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
+			entry.mask,
+			entry.trigger,
+			entry.irr,
+			entry.polarity,
+			entry.delivery_status,
+			entry.dest_mode,
+			entry.delivery_mode,
+			entry.vector
+		);
+	}
+	}
+	if (use_pci_vector())
+		printk(KERN_INFO "Using vector-based indexing\n");
+	printk(KERN_DEBUG "IRQ to pin mappings:\n");
+	for (i = 0; i < NR_IRQS; i++) {
+		struct irq_pin_list *entry = irq_2_pin + i;
+		if (entry->pin < 0)
+			continue;
+ 		if (use_pci_vector() && !platform_legacy_irq(i))
+			printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
+		else
+			printk(KERN_DEBUG "IRQ%d ", i);
+		for (;;) {
+			printk("-> %d:%d", entry->apic, entry->pin);
+			if (!entry->next)
+				break;
+			entry = irq_2_pin + entry->next;
+		}
+		printk("\n");
+	}
+
+	printk(KERN_INFO ".................................... done.\n");
+
+	return;
+}
+
+#if 0
+
+static __apicdebuginit void print_APIC_bitfield (int base)
+{
+	unsigned int v;
+	int i, j;
+
+	if (apic_verbosity == APIC_QUIET)
+		return;
+
+	printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
+	for (i = 0; i < 8; i++) {
+		v = apic_read(base + i*0x10);
+		for (j = 0; j < 32; j++) {
+			if (v & (1<<j))
+				printk("1");
+			else
+				printk("0");
+		}
+		printk("\n");
+	}
+}
+
+void __apicdebuginit print_local_APIC(void * dummy)
+{
+	unsigned int v, ver, maxlvt;
+
+	if (apic_verbosity == APIC_QUIET)
+		return;
+
+	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+		smp_processor_id(), hard_smp_processor_id());
+	v = apic_read(APIC_ID);
+	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
+	v = apic_read(APIC_LVR);
+	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+	ver = GET_APIC_VERSION(v);
+	maxlvt = get_maxlvt();
+
+	v = apic_read(APIC_TASKPRI);
+	printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+	if (APIC_INTEGRATED(ver)) {			/* !82489DX */
+		v = apic_read(APIC_ARBPRI);
+		printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+			v & APIC_ARBPRI_MASK);
+		v = apic_read(APIC_PROCPRI);
+		printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+	}
+
+	v = apic_read(APIC_EOI);
+	printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+	v = apic_read(APIC_RRR);
+	printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+	v = apic_read(APIC_LDR);
+	printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+	v = apic_read(APIC_DFR);
+	printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+	v = apic_read(APIC_SPIV);
+	printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+	printk(KERN_DEBUG "... APIC ISR field:\n");
+	print_APIC_bitfield(APIC_ISR);
+	printk(KERN_DEBUG "... APIC TMR field:\n");
+	print_APIC_bitfield(APIC_TMR);
+	printk(KERN_DEBUG "... APIC IRR field:\n");
+	print_APIC_bitfield(APIC_IRR);
+
+	if (APIC_INTEGRATED(ver)) {		/* !82489DX */
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
+			apic_write(APIC_ESR, 0);
+		v = apic_read(APIC_ESR);
+		printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+	}
+
+	v = apic_read(APIC_ICR);
+	printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
+	v = apic_read(APIC_ICR2);
+	printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+
+	v = apic_read(APIC_LVTT);
+	printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+	if (maxlvt > 3) {                       /* PC is LVT#4. */
+		v = apic_read(APIC_LVTPC);
+		printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+	}
+	v = apic_read(APIC_LVT0);
+	printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+	v = apic_read(APIC_LVT1);
+	printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+	if (maxlvt > 2) {			/* ERR is LVT#3. */
+		v = apic_read(APIC_LVTERR);
+		printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+	}
+
+	v = apic_read(APIC_TMICT);
+	printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+	v = apic_read(APIC_TMCCT);
+	printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+	v = apic_read(APIC_TDCR);
+	printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+	printk("\n");
+}
+
+void print_all_local_APICs (void)
+{
+	on_each_cpu(print_local_APIC, NULL, 1, 1);
+}
+
+void __apicdebuginit print_PIC(void)
+{
+	extern spinlock_t i8259A_lock;
+	unsigned int v;
+	unsigned long flags;
+
+	if (apic_verbosity == APIC_QUIET)
+		return;
+
+	printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	v = inb(0xa1) << 8 | inb(0x21);
+	printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
+
+	v = inb(0xa0) << 8 | inb(0x20);
+	printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
+
+	outb(0x0b,0xa0);
+	outb(0x0b,0x20);
+	v = inb(0xa0) << 8 | inb(0x20);
+	outb(0x0a,0xa0);
+	outb(0x0a,0x20);
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+
+	printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
+
+	v = inb(0x4d1) << 8 | inb(0x4d0);
+	printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+
+#endif  /*  0  */
+
+static void __init enable_IO_APIC(void)
+{
+	union IO_APIC_reg_01 reg_01;
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < PIN_MAP_SIZE; i++) {
+		irq_2_pin[i].pin = -1;
+		irq_2_pin[i].next = 0;
+	}
+	if (!pirqs_enabled)
+		for (i = 0; i < MAX_PIRQS; i++)
+			pirq_entries[i] = -1;
+
+	/*
+	 * The number of IO-APIC IRQ registers (== #pins):
+	 */
+	for (i = 0; i < nr_ioapics; i++) {
+		spin_lock_irqsave(&ioapic_lock, flags);
+		reg_01.raw = io_apic_read(i, 1);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+		nr_ioapic_registers[i] = reg_01.bits.entries+1;
+	}
+
+	/*
+	 * Do not trust the IO-APIC being empty at bootup
+	 */
+	clear_IO_APIC();
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+	/*
+	 * Clear the IO-APIC before rebooting:
+	 */
+	clear_IO_APIC();
+
+	disconnect_bsp_APIC();
+}
+
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
+ */
+
+static void __init setup_ioapic_ids_from_mpc (void)
+{
+	union IO_APIC_reg_00 reg_00;
+	int apic;
+	int i;
+	unsigned char old_id;
+	unsigned long flags;
+
+	/*
+	 * Set the IOAPIC ID to the value stored in the MPC table.
+	 */
+	for (apic = 0; apic < nr_ioapics; apic++) {
+
+		/* Read the register 0 value */
+		spin_lock_irqsave(&ioapic_lock, flags);
+		reg_00.raw = io_apic_read(apic, 0);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+		
+		old_id = mp_ioapics[apic].mpc_apicid;
+
+
+		printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
+
+
+		/*
+		 * We need to adjust the IRQ routing table
+		 * if the ID changed.
+		 */
+		if (old_id != mp_ioapics[apic].mpc_apicid)
+			for (i = 0; i < mp_irq_entries; i++)
+				if (mp_irqs[i].mpc_dstapic == old_id)
+					mp_irqs[i].mpc_dstapic
+						= mp_ioapics[apic].mpc_apicid;
+
+		/*
+		 * Read the right value from the MPC table and
+		 * write it into the ID register.
+	 	 */
+		apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
+				mp_ioapics[apic].mpc_apicid);
+
+		reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+		spin_lock_irqsave(&ioapic_lock, flags);
+		io_apic_write(apic, 0, reg_00.raw);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+
+		/*
+		 * Sanity check
+		 */
+		spin_lock_irqsave(&ioapic_lock, flags);
+		reg_00.raw = io_apic_read(apic, 0);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+		if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+			printk("could not set ID!\n");
+		else
+			apic_printk(APIC_VERBOSE," ok.\n");
+	}
+}
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ *	- timer IRQ defaults to IO-APIC IRQ
+ *	- if this function detects that timer IRQs are defunct, then we fall
+ *	  back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+	unsigned long t1 = jiffies;
+
+	local_irq_enable();
+	/* Let ten ticks pass... */
+	mdelay((10 * 1000) / HZ);
+
+	/*
+	 * Expect a few ticks at least, to be sure some possible
+	 * glue logic does not lock up after one or two first
+	 * ticks in a non-ExtINT mode.  Also the local APIC
+	 * might have cached one ExtINT interrupt.  Finally, at
+	 * least one tick may be lost due to delays.
+	 */
+
+	/* jiffies wrap? */
+	if (jiffies - t1 > 4)
+		return 1;
+	return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Starting up a edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * return 1 to indicate that is was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+
+static unsigned int startup_edge_ioapic_irq(unsigned int irq)
+{
+	int was_pending = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	if (irq < 16) {
+		disable_8259A_irq(irq);
+		if (i8259A_irq_pending(irq))
+			was_pending = 1;
+	}
+	__unmask_IO_APIC_irq(irq);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return was_pending;
+}
+
+/*
+ * Once we have recorded IRQ_PENDING already, we can mask the
+ * interrupt for real. This prevents IRQ storms from unhandled
+ * devices.
+ */
+static void ack_edge_ioapic_irq(unsigned int irq)
+{
+	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
+					== (IRQ_PENDING | IRQ_DISABLED))
+		mask_IO_APIC_irq(irq);
+	ack_APIC_irq();
+}
+
+/*
+ * Level triggered interrupts can just be masked,
+ * and shutting down and starting up the interrupt
+ * is the same as enabling and disabling them -- except
+ * with a startup need to return a "was pending" value.
+ *
+ * Level triggered interrupts are special because we
+ * do not touch any IO-APIC register while handling
+ * them. We ack the APIC in the end-IRQ handler, not
+ * in the start-IRQ-handler. Protection against reentrance
+ * from the same interrupt is still provided, both by the
+ * generic IRQ layer and by the fact that an unacked local
+ * APIC does not accept IRQs.
+ */
+static unsigned int startup_level_ioapic_irq (unsigned int irq)
+{
+	unmask_IO_APIC_irq(irq);
+
+	return 0; /* don't check for pending */
+}
+
+static void end_level_ioapic_irq (unsigned int irq)
+{
+	ack_APIC_irq();
+}
+
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	unsigned long flags;
+	unsigned int dest;
+
+	dest = cpu_mask_to_apicid(mask);
+
+	/*
+	 * Only the high 8 bits are valid.
+	 */
+	dest = SET_APIC_LOGICAL_ID(dest);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__DO_ACTION(1, = dest, )
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifdef CONFIG_PCI_MSI
+static unsigned int startup_edge_ioapic_vector(unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	return startup_edge_ioapic_irq(irq);
+}
+
+static void ack_edge_ioapic_vector(unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	ack_edge_ioapic_irq(irq);
+}
+
+static unsigned int startup_level_ioapic_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	return startup_level_ioapic_irq (irq);
+}
+
+static void end_level_ioapic_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	end_level_ioapic_irq(irq);
+}
+
+static void mask_IO_APIC_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	mask_IO_APIC_irq(irq);
+}
+
+static void unmask_IO_APIC_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	unmask_IO_APIC_irq(irq);
+}
+
+static void set_ioapic_affinity_vector (unsigned int vector,
+					cpumask_t cpu_mask)
+{
+	int irq = vector_to_irq(vector);
+
+	set_ioapic_affinity_irq(irq, cpu_mask);
+}
+#endif
+
+/*
+ * Level and edge triggered IO-APIC interrupts need different handling,
+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
+ * handled with the level-triggered descriptor, but that one has slightly
+ * more overhead. Level-triggered interrupts cannot be handled with the
+ * edge-triggered handler, without risking IRQ storms and other ugly
+ * races.
+ */
+
+static struct hw_interrupt_type ioapic_edge_type = {
+	.typename = "IO-APIC-edge",
+	.startup 	= startup_edge_ioapic,
+	.shutdown 	= shutdown_edge_ioapic,
+	.enable 	= enable_edge_ioapic,
+	.disable 	= disable_edge_ioapic,
+	.ack 		= ack_edge_ioapic,
+	.end 		= end_edge_ioapic,
+	.set_affinity = set_ioapic_affinity,
+};
+
+static struct hw_interrupt_type ioapic_level_type = {
+	.typename = "IO-APIC-level",
+	.startup 	= startup_level_ioapic,
+	.shutdown 	= shutdown_level_ioapic,
+	.enable 	= enable_level_ioapic,
+	.disable 	= disable_level_ioapic,
+	.ack 		= mask_and_ack_level_ioapic,
+	.end 		= end_level_ioapic,
+	.set_affinity = set_ioapic_affinity,
+};
+
+static inline void init_IO_APIC_traps(void)
+{
+	int irq;
+
+	/*
+	 * NOTE! The local APIC isn't very good at handling
+	 * multiple interrupts at the same interrupt level.
+	 * As the interrupt level is determined by taking the
+	 * vector number and shifting that right by 4, we
+	 * want to spread these out a bit so that they don't
+	 * all fall in the same interrupt level.
+	 *
+	 * Also, we've got to be careful not to trash gate
+	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
+	 */
+	for (irq = 0; irq < NR_IRQS ; irq++) {
+		int tmp = irq;
+		if (use_pci_vector()) {
+			if (!platform_legacy_irq(tmp))
+				if ((tmp = vector_to_irq(tmp)) == -1)
+					continue;
+		}
+		if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
+			/*
+			 * Hmm.. We don't have an entry for this,
+			 * so default to an old-fashioned 8259
+			 * interrupt if we can..
+			 */
+			if (irq < 16)
+				make_8259A_irq(irq);
+			else
+				/* Strange. Oh, well.. */
+				irq_desc[irq].handler = &no_irq_type;
+		}
+	}
+}
+
+static void enable_lapic_irq (unsigned int irq)
+{
+	unsigned long v;
+
+	v = apic_read(APIC_LVT0);
+	apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static void disable_lapic_irq (unsigned int irq)
+{
+	unsigned long v;
+
+	v = apic_read(APIC_LVT0);
+	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void ack_lapic_irq (unsigned int irq)
+{
+	ack_APIC_irq();
+}
+
+static void end_lapic_irq (unsigned int i) { /* nothing */ }
+
+static struct hw_interrupt_type lapic_irq_type = {
+	.typename = "local-APIC-edge",
+	.startup = NULL, /* startup_irq() not used for IRQ0 */
+	.shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
+	.enable = enable_lapic_irq,
+	.disable = disable_lapic_irq,
+	.ack = ack_lapic_irq,
+	.end = end_lapic_irq,
+};
+
+static void setup_nmi (void)
+{
+	/*
+ 	 * Dirty trick to enable the NMI watchdog ...
+	 * We put the 8259A master into AEOI mode and
+	 * unmask on all local APICs LVT0 as NMI.
+	 *
+	 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+	 * is from Maciej W. Rozycki - so we do not have to EOI from
+	 * the NMI handler or the timer interrupt.
+	 */ 
+	printk(KERN_INFO "activating NMI Watchdog ...");
+
+	enable_NMI_through_LVT0(NULL);
+
+	printk(" done.\n");
+}
+
+/*
+ * This looks a bit hackish but it's about the only one way of sending
+ * a few INTA cycles to 8259As and any associated glue logic.  ICR does
+ * not support the ExtINT mode, unfortunately.  We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA.  --macro
+ */
+static inline void unlock_ExtINT_logic(void)
+{
+	int pin, i;
+	struct IO_APIC_route_entry entry0, entry1;
+	unsigned char save_control, save_freq_select;
+	unsigned long flags;
+
+	pin = find_isa_irq_pin(8, mp_INT);
+	if (pin == -1)
+		return;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	*(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
+	*(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+	clear_IO_APIC_pin(0, pin);
+
+	memset(&entry1, 0, sizeof(entry1));
+
+	entry1.dest_mode = 0;			/* physical delivery */
+	entry1.mask = 0;			/* unmask IRQ now */
+	entry1.dest.physical.physical_dest = hard_smp_processor_id();
+	entry1.delivery_mode = dest_ExtINT;
+	entry1.polarity = entry0.polarity;
+	entry1.trigger = 0;
+	entry1.vector = 0;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
+	io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	save_control = CMOS_READ(RTC_CONTROL);
+	save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+	CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+		   RTC_FREQ_SELECT);
+	CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+	i = 100;
+	while (i-- > 0) {
+		mdelay(10);
+		if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+			i -= 10;
+	}
+
+	CMOS_WRITE(save_control, RTC_CONTROL);
+	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+	clear_IO_APIC_pin(0, pin);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
+	io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
+ * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ */
+static inline void check_timer(void)
+{
+	int pin1, pin2;
+	int vector;
+
+	/*
+	 * get/set the timer IRQ vector:
+	 */
+	disable_8259A_irq(0);
+	vector = assign_irq_vector(0);
+	set_intr_gate(vector, interrupt[0]);
+
+	/*
+	 * Subtle, code in do_timer_interrupt() expects an AEOI
+	 * mode for the 8259A whenever interrupts are routed
+	 * through I/O APICs.  Also IRQ0 has to be enabled in
+	 * the 8259A which implies the virtual wire has to be
+	 * disabled in the local APIC.
+	 */
+	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+	init_8259A(1);
+	enable_8259A_irq(0);
+
+	pin1 = find_isa_irq_pin(0, mp_INT);
+	pin2 = find_isa_irq_pin(0, mp_ExtINT);
+
+	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2);
+
+	if (pin1 != -1) {
+		/*
+		 * Ok, does IRQ0 through the IOAPIC work?
+		 */
+		unmask_IO_APIC_irq(0);
+		if (timer_irq_works()) {
+			nmi_watchdog_default();
+			if (nmi_watchdog == NMI_IO_APIC) {
+				disable_8259A_irq(0);
+				setup_nmi();
+				enable_8259A_irq(0);
+				check_nmi_watchdog();
+			}
+			return;
+		}
+		clear_IO_APIC_pin(0, pin1);
+		apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
+	}
+
+	apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
+	if (pin2 != -1) {
+		apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2);
+		/*
+		 * legacy devices should be connected to IO APIC #0
+		 */
+		setup_ExtINT_IRQ0_pin(pin2, vector);
+		if (timer_irq_works()) {
+			printk("works.\n");
+			nmi_watchdog_default();
+			if (nmi_watchdog == NMI_IO_APIC) {
+				setup_nmi();
+				check_nmi_watchdog();
+			}
+			return;
+		}
+		/*
+		 * Cleanup, just in case ...
+		 */
+		clear_IO_APIC_pin(0, pin2);
+	}
+	printk(" failed.\n");
+
+	if (nmi_watchdog) {
+		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+		nmi_watchdog = 0;
+	}
+
+	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+
+	disable_8259A_irq(0);
+	irq_desc[0].handler = &lapic_irq_type;
+	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
+	enable_8259A_irq(0);
+
+	if (timer_irq_works()) {
+		apic_printk(APIC_QUIET, " works.\n");
+		return;
+	}
+	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+	apic_printk(APIC_VERBOSE," failed.\n");
+
+	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+
+	init_8259A(0);
+	make_8259A_irq(0);
+	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+
+	unlock_ExtINT_logic();
+
+	if (timer_irq_works()) {
+		apic_printk(APIC_VERBOSE," works.\n");
+		return;
+	}
+	apic_printk(APIC_VERBOSE," failed :(.\n");
+	panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+}
+
+/*
+ *
+ * IRQ's that are handled by the PIC in the MPS IOAPIC case.
+ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
+ *   Linux doesn't really care, as it's not actually used
+ *   for any interrupt handling anyway.
+ */
+#define PIC_IRQS	(1<<2)
+
+void __init setup_IO_APIC(void)
+{
+	enable_IO_APIC();
+
+	if (acpi_ioapic)
+		io_apic_irqs = ~0;	/* all IRQs go through IOAPIC */
+	else
+		io_apic_irqs = ~PIC_IRQS;
+
+	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+
+	/*
+	 * Set up the IO-APIC IRQ routing table.
+	 */
+	if (!acpi_ioapic)
+		setup_ioapic_ids_from_mpc();
+	sync_Arb_IDs();
+	setup_IO_APIC_irqs();
+	init_IO_APIC_traps();
+	check_timer();
+	if (!acpi_ioapic)
+		print_IO_APIC();
+}
+
+struct sysfs_ioapic_data {
+	struct sys_device dev;
+	struct IO_APIC_route_entry entry[0];
+};
+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+
+static int ioapic_suspend(struct sys_device *dev, u32 state)
+{
+	struct IO_APIC_route_entry *entry;
+	struct sysfs_ioapic_data *data;
+	unsigned long flags;
+	int i;
+
+	data = container_of(dev, struct sysfs_ioapic_data, dev);
+	entry = data->entry;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
+		*(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
+		*(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
+	}
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return 0;
+}
+
+static int ioapic_resume(struct sys_device *dev)
+{
+	struct IO_APIC_route_entry *entry;
+	struct sysfs_ioapic_data *data;
+	unsigned long flags;
+	union IO_APIC_reg_00 reg_00;
+	int i;
+
+	data = container_of(dev, struct sysfs_ioapic_data, dev);
+	entry = data->entry;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_00.raw = io_apic_read(dev->id, 0);
+	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
+		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+		io_apic_write(dev->id, 0, reg_00.raw);
+	}
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
+		io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
+		io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
+	}
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return 0;
+}
+
+static struct sysdev_class ioapic_sysdev_class = {
+	set_kset_name("ioapic"),
+	.suspend = ioapic_suspend,
+	.resume = ioapic_resume,
+};
+
+static int __init ioapic_init_sysfs(void)
+{
+	struct sys_device * dev;
+	int i, size, error = 0;
+
+	error = sysdev_class_register(&ioapic_sysdev_class);
+	if (error)
+		return error;
+
+	for (i = 0; i < nr_ioapics; i++ ) {
+		size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+			* sizeof(struct IO_APIC_route_entry);
+		mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+		if (!mp_ioapic_data[i]) {
+			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+			continue;
+		}
+		memset(mp_ioapic_data[i], 0, size);
+		dev = &mp_ioapic_data[i]->dev;
+		dev->id = i;
+		dev->cls = &ioapic_sysdev_class;
+		error = sysdev_register(dev);
+		if (error) {
+			kfree(mp_ioapic_data[i]);
+			mp_ioapic_data[i] = NULL;
+			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+			continue;
+		}
+	}
+
+	return 0;
+}
+
+device_initcall(ioapic_init_sysfs);
+
+/* --------------------------------------------------------------------------
+                          ACPI-based IOAPIC Configuration
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI_BOOT
+
+#define IO_APIC_MAX_ID		0xFE
+
+int __init io_apic_get_unique_id (int ioapic, int apic_id)
+{
+	union IO_APIC_reg_00 reg_00;
+	static physid_mask_t apic_id_map;
+	unsigned long flags;
+	int i = 0;
+
+	/*
+	 * The P4 platform supports up to 256 APIC IDs on two separate APIC 
+	 * buses (one for LAPICs, one for IOAPICs), where predecessors only 
+	 * supports up to 16 on one shared APIC bus.
+	 * 
+	 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+	 *      advantage of new APIC bus architecture.
+	 */
+
+	if (physids_empty(apic_id_map))
+		apic_id_map = phys_cpu_present_map;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_00.raw = io_apic_read(ioapic, 0);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	if (apic_id >= IO_APIC_MAX_ID) {
+		apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+			"%d\n", ioapic, apic_id, reg_00.bits.ID);
+		apic_id = reg_00.bits.ID;
+	}
+
+	/*
+	 * Every APIC in a system must have a unique ID or we get lots of nice 
+	 * 'stuck on smp_invalidate_needed IPI wait' messages.
+	 */
+	if (physid_isset(apic_id, apic_id_map)) {
+
+		for (i = 0; i < IO_APIC_MAX_ID; i++) {
+			if (!physid_isset(i, apic_id_map))
+				break;
+		}
+
+		if (i == IO_APIC_MAX_ID)
+			panic("Max apic_id exceeded!\n");
+
+		apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+			"trying %d\n", ioapic, apic_id, i);
+
+		apic_id = i;
+	} 
+
+	physid_set(apic_id, apic_id_map);
+
+	if (reg_00.bits.ID != apic_id) {
+		reg_00.bits.ID = apic_id;
+
+		spin_lock_irqsave(&ioapic_lock, flags);
+		io_apic_write(ioapic, 0, reg_00.raw);
+		reg_00.raw = io_apic_read(ioapic, 0);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+
+		/* Sanity check */
+		if (reg_00.bits.ID != apic_id)
+			panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
+	}
+
+	apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+	return apic_id;
+}
+
+
+int __init io_apic_get_version (int ioapic)
+{
+	union IO_APIC_reg_01	reg_01;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_01.raw = io_apic_read(ioapic, 1);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return reg_01.bits.version;
+}
+
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+	union IO_APIC_reg_01	reg_01;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_01.raw = io_apic_read(ioapic, 1);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return reg_01.bits.entries;
+}
+
+
+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+{
+	struct IO_APIC_route_entry entry;
+	unsigned long flags;
+
+	if (!IO_APIC_IRQ(irq)) {
+		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+			ioapic);
+		return -EINVAL;
+	}
+
+	/*
+	 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
+	 * Note that we mask (disable) IRQs now -- these get enabled when the
+	 * corresponding device driver registers for this IRQ.
+	 */
+
+	memset(&entry,0,sizeof(entry));
+
+	entry.delivery_mode = INT_DELIVERY_MODE;
+	entry.dest_mode = INT_DEST_MODE;
+	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+	entry.trigger = edge_level;
+	entry.polarity = active_high_low;
+	entry.mask = 1;					 /* Disabled (masked) */
+
+	/*
+	 * IRQs < 16 are already in the irq_2_pin[] map
+	 */
+	if (irq >= 16)
+		add_pin_to_irq(irq, ioapic, pin);
+
+	entry.vector = assign_irq_vector(irq);
+
+	apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
+		"IRQ %d Mode:%i Active:%i)\n", ioapic, 
+	       mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+	       edge_level, active_high_low);
+
+	ioapic_register_intr(irq, entry.vector, edge_level);
+
+	if (!ioapic && (irq < 16))
+		disable_8259A_irq(irq);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
+	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return 0;
+}
+
+#endif /*CONFIG_ACPI_BOOT*/
+
+
+/*
+ * This function currently is only a helper for the i386 smp boot process where
+ * we need to reprogram the ioredtbls to cater for the cpus which have come online
+ * so mask in all cases should simply be TARGET_CPUS
+ */
+void __init setup_ioapic_dest(void)
+{
+	int pin, ioapic, irq, irq_entry;
+
+	if (skip_ioapic_setup == 1)
+		return;
+
+	for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
+		for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+			irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+			if (irq_entry == -1)
+				continue;
+			irq = pin_2_irq(irq_entry, ioapic, pin);
+			set_ioapic_affinity_irq(irq, TARGET_CPUS);
+		}
+
+	}
+}
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
new file mode 100644
index 000000000000..cde0e868efe3
--- /dev/null
+++ b/arch/x86_64/kernel/ioport.c
@@ -0,0 +1,117 @@
+/*
+ *	linux/arch/x86_64/kernel/ioport.c
+ *
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+
+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+{
+	int i;
+		if (new_value)
+		for (i = base; i < base + extent; i++) 
+			__set_bit(i, bitmap); 
+		else
+		for (i = base; i < base + extent; i++) 
+			clear_bit(i, bitmap); 
+}
+
+/*
+ * this changes the io permissions bitmap in the current task.
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+	unsigned int i, max_long, bytes, bytes_updated;
+	struct thread_struct * t = &current->thread;
+	struct tss_struct * tss;
+	unsigned long *bitmap;
+
+	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+		return -EINVAL;
+	if (turn_on && !capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	/*
+	 * If it's the first ioperm() call in this thread's lifetime, set the
+	 * IO bitmap up. ioperm() is much less timing critical than clone(),
+	 * this is why we delay this operation until now:
+	 */
+	if (!t->io_bitmap_ptr) {
+		bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+		if (!bitmap)
+			return -ENOMEM;
+
+		memset(bitmap, 0xff, IO_BITMAP_BYTES);
+		t->io_bitmap_ptr = bitmap;
+	}
+
+	/*
+	 * do it in the per-thread copy and in the TSS ...
+	 *
+	 * Disable preemption via get_cpu() - we must not switch away
+	 * because the ->io_bitmap_max value must match the bitmap
+	 * contents:
+	 */
+	tss = &per_cpu(init_tss, get_cpu());
+
+	set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+	/*
+	 * Search for a (possibly new) maximum. This is simple and stupid,
+	 * to keep it obviously correct:
+	 */
+	max_long = 0;
+	for (i = 0; i < IO_BITMAP_LONGS; i++)
+		if (t->io_bitmap_ptr[i] != ~0UL)
+			max_long = i;
+
+	bytes = (max_long + 1) * sizeof(long);
+	bytes_updated = max(bytes, t->io_bitmap_max);
+
+	t->io_bitmap_max = bytes;
+
+	/* Update the TSS: */
+	memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+
+	put_cpu();
+
+	return 0;
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ *
+ * Here we just change the eflags value on the stack: we allow
+ * only the super-user to do it. This depends on the stack-layout
+ * on system-call entry - see also fork() and the signal handling
+ * code.
+ */
+
+asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+{
+	unsigned int old = (regs->eflags >> 12) & 3;
+
+	if (level > 3)
+		return -EINVAL;
+	/* Trying to gain more privileges? */
+	if (level > old) {
+		if (!capable(CAP_SYS_RAWIO))
+			return -EPERM;
+	}
+	regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
+	return 0;
+}
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
new file mode 100644
index 000000000000..62b112e4deb4
--- /dev/null
+++ b/arch/x86_64/kernel/irq.c
@@ -0,0 +1,108 @@
+/*
+ *	linux/arch/x86_64/kernel/irq.c
+ *
+ *	Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86_64-specific interrupt
+ * entry and irq statistics code. All the remaining irq logic is
+ * done by the generic kernel/irq/ code and in the
+ * x86_64-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/io_apic.h>
+
+atomic_t irq_err_count;
+#ifdef CONFIG_X86_IO_APIC
+#ifdef APIC_MISMATCH_DEBUG
+atomic_t irq_mis_count;
+#endif
+#endif
+
+/*
+ * Generic, controller-independent functions:
+ */
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+	int i = *(loff_t *) v, j;
+	struct irqaction * action;
+	unsigned long flags;
+
+	if (i == 0) {
+		seq_printf(p, "           ");
+		for (j=0; j<NR_CPUS; j++)
+			if (cpu_online(j))
+				seq_printf(p, "CPU%d       ",j);
+		seq_putc(p, '\n');
+	}
+
+	if (i < NR_IRQS) {
+		spin_lock_irqsave(&irq_desc[i].lock, flags);
+		action = irq_desc[i].action;
+		if (!action) 
+			goto skip;
+		seq_printf(p, "%3d: ",i);
+#ifndef CONFIG_SMP
+		seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+		for (j=0; j<NR_CPUS; j++)
+			if (cpu_online(j))
+			seq_printf(p, "%10u ",
+				kstat_cpu(j).irqs[i]);
+#endif
+		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+
+		seq_printf(p, "  %s", action->name);
+		for (action=action->next; action; action = action->next)
+			seq_printf(p, ", %s", action->name);
+		seq_putc(p, '\n');
+skip:
+		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+	} else if (i == NR_IRQS) {
+		seq_printf(p, "NMI: ");
+		for (j = 0; j < NR_CPUS; j++)
+			if (cpu_online(j))
+				seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
+		seq_putc(p, '\n');
+#ifdef CONFIG_X86_LOCAL_APIC
+		seq_printf(p, "LOC: ");
+		for (j = 0; j < NR_CPUS; j++)
+			if (cpu_online(j))
+				seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
+		seq_putc(p, '\n');
+#endif
+		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#ifdef CONFIG_X86_IO_APIC
+#ifdef APIC_MISMATCH_DEBUG
+		seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+#endif
+	}
+	return 0;
+}
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+{	
+	/* high bits used in ret_from_ code  */
+	unsigned irq = regs->orig_rax & 0xff;
+
+	irq_enter();
+	BUG_ON(irq > 256);
+
+	__do_IRQ(irq, regs);
+	irq_exit();
+
+	return 1;
+}
+
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
new file mode 100644
index 000000000000..4f2a852299b6
--- /dev/null
+++ b/arch/x86_64/kernel/kprobes.c
@@ -0,0 +1,631 @@
+/*
+ *  Kernel Probes (KProbes)
+ *  arch/x86_64/kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ *		Probes initial implementation ( includes contributions from
+ *		Rusty Russell).
+ * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ *		interface to access function arguments.
+ * 2004-Oct	Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
+ *		<prasanna@in.ibm.com> adapted for x86_64
+ * 2005-Mar	Roland McGrath <roland@redhat.com>
+ *		Fixed to handle %rip-relative addressing mode correctly.
+ */
+
+#include <linux/config.h>
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/moduleloader.h>
+
+#include <asm/pgtable.h>
+#include <asm/kdebug.h>
+
+static DECLARE_MUTEX(kprobe_mutex);
+
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE	0x00000001
+#define KPROBE_HIT_SS		0x00000002
+
+static struct kprobe *current_kprobe;
+static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
+static struct pt_regs jprobe_saved_regs;
+static long *jprobe_saved_rsp;
+static kprobe_opcode_t *get_insn_slot(void);
+static void free_insn_slot(kprobe_opcode_t *slot);
+void jprobe_return_end(void);
+
+/* copy of the kernel stack at the probe fire time */
+static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static inline int is_IF_modifier(kprobe_opcode_t *insn)
+{
+	switch (*insn) {
+	case 0xfa:		/* cli */
+	case 0xfb:		/* sti */
+	case 0xcf:		/* iret/iretd */
+	case 0x9d:		/* popf/popfd */
+		return 1;
+	}
+
+	if (*insn  >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
+		return 1;
+	return 0;
+}
+
+int arch_prepare_kprobe(struct kprobe *p)
+{
+	/* insn: must be on special executable page on x86_64. */
+	up(&kprobe_mutex);
+	p->ainsn.insn = get_insn_slot();
+	down(&kprobe_mutex);
+	if (!p->ainsn.insn) {
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * Determine if the instruction uses the %rip-relative addressing mode.
+ * If it does, return the address of the 32-bit displacement word.
+ * If not, return null.
+ */
+static inline s32 *is_riprel(u8 *insn)
+{
+#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf)		      \
+	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
+	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
+	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
+	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
+	 << (row % 64))
+	static const u64 onebyte_has_modrm[256 / 64] = {
+		/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+		/*      -------------------------------         */
+		W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
+		W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
+		W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
+		W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
+		W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
+		W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
+		W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
+		W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
+		W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
+		W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
+		W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
+		W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
+		W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
+		W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
+		W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
+		W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1)  /* f0 */
+		/*      -------------------------------         */
+		/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+	};
+	static const u64 twobyte_has_modrm[256 / 64] = {
+		/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+		/*      -------------------------------         */
+		W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
+		W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
+		W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
+		W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
+		W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
+		W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
+		W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
+		W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
+		W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
+		W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
+		W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
+		W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
+		W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
+		W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
+		W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
+		W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0)  /* ff */
+		/*      -------------------------------         */
+		/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+	};
+#undef	W
+	int need_modrm;
+
+	/* Skip legacy instruction prefixes.  */
+	while (1) {
+		switch (*insn) {
+		case 0x66:
+		case 0x67:
+		case 0x2e:
+		case 0x3e:
+		case 0x26:
+		case 0x64:
+		case 0x65:
+		case 0x36:
+		case 0xf0:
+		case 0xf3:
+		case 0xf2:
+			++insn;
+			continue;
+		}
+		break;
+	}
+
+	/* Skip REX instruction prefix.  */
+	if ((*insn & 0xf0) == 0x40)
+		++insn;
+
+	if (*insn == 0x0f) {	/* Two-byte opcode.  */
+		++insn;
+		need_modrm = test_bit(*insn, twobyte_has_modrm);
+	} else {		/* One-byte opcode.  */
+		need_modrm = test_bit(*insn, onebyte_has_modrm);
+	}
+
+	if (need_modrm) {
+		u8 modrm = *++insn;
+		if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
+			/* Displacement follows ModRM byte.  */
+			return (s32 *) ++insn;
+		}
+	}
+
+	/* No %rip-relative addressing mode here.  */
+	return NULL;
+}
+
+void arch_copy_kprobe(struct kprobe *p)
+{
+	s32 *ripdisp;
+	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
+	ripdisp = is_riprel(p->ainsn.insn);
+	if (ripdisp) {
+		/*
+		 * The copied instruction uses the %rip-relative
+		 * addressing mode.  Adjust the displacement for the
+		 * difference between the original location of this
+		 * instruction and the location of the copy that will
+		 * actually be run.  The tricky bit here is making sure
+		 * that the sign extension happens correctly in this
+		 * calculation, since we need a signed 32-bit result to
+		 * be sign-extended to 64 bits when it's added to the
+		 * %rip value and yield the same 64-bit result that the
+		 * sign-extension of the original signed 32-bit
+		 * displacement would have given.
+		 */
+		s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
+		BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
+		*ripdisp = disp;
+	}
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+	up(&kprobe_mutex);
+	free_insn_slot(p->ainsn.insn);
+	down(&kprobe_mutex);
+}
+
+static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+	*p->addr = p->opcode;
+	regs->rip = (unsigned long)p->addr;
+}
+
+static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+	regs->eflags |= TF_MASK;
+	regs->eflags &= ~IF_MASK;
+	/*single step inline if the instruction is an int3*/
+	if (p->opcode == BREAKPOINT_INSTRUCTION)
+		regs->rip = (unsigned long)p->addr;
+	else
+		regs->rip = (unsigned long)p->ainsn.insn;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+int kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *p;
+	int ret = 0;
+	kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
+
+	/* We're in an interrupt, but this is clear and BUG()-safe. */
+	preempt_disable();
+
+	/* Check we're not actually recursing */
+	if (kprobe_running()) {
+		/* We *are* holding lock here, so this is safe.
+		   Disarm the probe we just hit, and ignore it. */
+		p = get_kprobe(addr);
+		if (p) {
+			if (kprobe_status == KPROBE_HIT_SS) {
+				regs->eflags &= ~TF_MASK;
+				regs->eflags |= kprobe_saved_rflags;
+				unlock_kprobes();
+				goto no_kprobe;
+			}
+			disarm_kprobe(p, regs);
+			ret = 1;
+		} else {
+			p = current_kprobe;
+			if (p->break_handler && p->break_handler(p, regs)) {
+				goto ss_probe;
+			}
+		}
+		/* If it's not ours, can't be delete race, (we hold lock). */
+		goto no_kprobe;
+	}
+
+	lock_kprobes();
+	p = get_kprobe(addr);
+	if (!p) {
+		unlock_kprobes();
+		if (*addr != BREAKPOINT_INSTRUCTION) {
+			/*
+			 * The breakpoint instruction was removed right
+			 * after we hit it.  Another cpu has removed
+			 * either a probepoint or a debugger breakpoint
+			 * at this address.  In either case, no further
+			 * handling of this interrupt is appropriate.
+			 */
+			ret = 1;
+		}
+		/* Not one of ours: let kernel handle it */
+		goto no_kprobe;
+	}
+
+	kprobe_status = KPROBE_HIT_ACTIVE;
+	current_kprobe = p;
+	kprobe_saved_rflags = kprobe_old_rflags
+	    = (regs->eflags & (TF_MASK | IF_MASK));
+	if (is_IF_modifier(p->ainsn.insn))
+		kprobe_saved_rflags &= ~IF_MASK;
+
+	if (p->pre_handler && p->pre_handler(p, regs))
+		/* handler has already set things up, so skip ss setup */
+		return 1;
+
+ss_probe:
+	prepare_singlestep(p, regs);
+	kprobe_status = KPROBE_HIT_SS;
+	return 1;
+
+no_kprobe:
+	preempt_enable_no_resched();
+	return ret;
+}
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int 3"
+ * instruction.  To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction.  The address of this
+ * copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * interrupt.  We have to fix up the stack as follows:
+ *
+ * 0) Except in the case of absolute or indirect jump or call instructions,
+ * the new rip is relative to the copied instruction.  We need to make
+ * it relative to the original instruction.
+ *
+ * 1) If the single-stepped instruction was pushfl, then the TF and IF
+ * flags are set in the just-pushed eflags, and may need to be cleared.
+ *
+ * 2) If the single-stepped instruction was a call, the return address
+ * that is atop the stack is the address following the copied instruction.
+ * We need to make it the address following the original instruction.
+ */
+static void resume_execution(struct kprobe *p, struct pt_regs *regs)
+{
+	unsigned long *tos = (unsigned long *)regs->rsp;
+	unsigned long next_rip = 0;
+	unsigned long copy_rip = (unsigned long)p->ainsn.insn;
+	unsigned long orig_rip = (unsigned long)p->addr;
+	kprobe_opcode_t *insn = p->ainsn.insn;
+
+	/*skip the REX prefix*/
+	if (*insn >= 0x40 && *insn <= 0x4f)
+		insn++;
+
+	switch (*insn) {
+	case 0x9c:		/* pushfl */
+		*tos &= ~(TF_MASK | IF_MASK);
+		*tos |= kprobe_old_rflags;
+		break;
+	case 0xe8:		/* call relative - Fix return addr */
+		*tos = orig_rip + (*tos - copy_rip);
+		break;
+	case 0xff:
+		if ((*insn & 0x30) == 0x10) {
+			/* call absolute, indirect */
+			/* Fix return addr; rip is correct. */
+			next_rip = regs->rip;
+			*tos = orig_rip + (*tos - copy_rip);
+		} else if (((*insn & 0x31) == 0x20) ||	/* jmp near, absolute indirect */
+			   ((*insn & 0x31) == 0x21)) {	/* jmp far, absolute indirect */
+			/* rip is correct. */
+			next_rip = regs->rip;
+		}
+		break;
+	case 0xea:		/* jmp absolute -- rip is correct */
+		next_rip = regs->rip;
+		break;
+	default:
+		break;
+	}
+
+	regs->eflags &= ~TF_MASK;
+	if (next_rip) {
+		regs->rip = next_rip;
+	} else {
+		regs->rip = orig_rip + (regs->rip - copy_rip);
+	}
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thoroughout this function.  And we hold kprobe lock.
+ */
+int post_kprobe_handler(struct pt_regs *regs)
+{
+	if (!kprobe_running())
+		return 0;
+
+	if (current_kprobe->post_handler)
+		current_kprobe->post_handler(current_kprobe, regs, 0);
+
+	resume_execution(current_kprobe, regs);
+	regs->eflags |= kprobe_saved_rflags;
+
+	unlock_kprobes();
+	preempt_enable_no_resched();
+
+	/*
+	 * if somebody else is singlestepping across a probe point, eflags
+	 * will have TF set, in which case, continue the remaining processing
+	 * of do_debug, as if this is not a probe hit.
+	 */
+	if (regs->eflags & TF_MASK)
+		return 0;
+
+	return 1;
+}
+
+/* Interrupts disabled, kprobe_lock held. */
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+	if (current_kprobe->fault_handler
+	    && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+		return 1;
+
+	if (kprobe_status & KPROBE_HIT_SS) {
+		resume_execution(current_kprobe, regs);
+		regs->eflags |= kprobe_old_rflags;
+
+		unlock_kprobes();
+		preempt_enable_no_resched();
+	}
+	return 0;
+}
+
+/*
+ * Wrapper routine for handling exceptions.
+ */
+int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
+			     void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	switch (val) {
+	case DIE_INT3:
+		if (kprobe_handler(args->regs))
+			return NOTIFY_STOP;
+		break;
+	case DIE_DEBUG:
+		if (post_kprobe_handler(args->regs))
+			return NOTIFY_STOP;
+		break;
+	case DIE_GPF:
+		if (kprobe_running() &&
+		    kprobe_fault_handler(args->regs, args->trapnr))
+			return NOTIFY_STOP;
+		break;
+	case DIE_PAGE_FAULT:
+		if (kprobe_running() &&
+		    kprobe_fault_handler(args->regs, args->trapnr))
+			return NOTIFY_STOP;
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct jprobe *jp = container_of(p, struct jprobe, kp);
+	unsigned long addr;
+
+	jprobe_saved_regs = *regs;
+	jprobe_saved_rsp = (long *) regs->rsp;
+	addr = (unsigned long)jprobe_saved_rsp;
+	/*
+	 * As Linus pointed out, gcc assumes that the callee
+	 * owns the argument space and could overwrite it, e.g.
+	 * tailcall optimization. So, to be absolutely safe
+	 * we also save and restore enough stack bytes to cover
+	 * the argument area.
+	 */
+	memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr));
+	regs->eflags &= ~IF_MASK;
+	regs->rip = (unsigned long)(jp->entry);
+	return 1;
+}
+
+void jprobe_return(void)
+{
+	preempt_enable_no_resched();
+	asm volatile ("       xchg   %%rbx,%%rsp     \n"
+		      "       int3			\n"
+		      "       .globl jprobe_return_end	\n"
+		      "       jprobe_return_end:	\n"
+		      "       nop			\n"::"b"
+		      (jprobe_saved_rsp):"memory");
+}
+
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	u8 *addr = (u8 *) (regs->rip - 1);
+	unsigned long stack_addr = (unsigned long)jprobe_saved_rsp;
+	struct jprobe *jp = container_of(p, struct jprobe, kp);
+
+	if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
+		if ((long *)regs->rsp != jprobe_saved_rsp) {
+			struct pt_regs *saved_regs =
+			    container_of(jprobe_saved_rsp, struct pt_regs, rsp);
+			printk("current rsp %p does not match saved rsp %p\n",
+			       (long *)regs->rsp, jprobe_saved_rsp);
+			printk("Saved registers for jprobe %p\n", jp);
+			show_registers(saved_regs);
+			printk("Current registers\n");
+			show_registers(regs);
+			BUG();
+		}
+		*regs = jprobe_saved_regs;
+		memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack,
+		       MIN_STACK_SIZE(stack_addr));
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped.
+ * By default on x86_64, pages we get from kmalloc or vmalloc are not
+ * executable.  Single-stepping an instruction on such a page yields an
+ * oops.  So instead of storing the instruction copies in their respective
+ * kprobe objects, we allocate a page, map it executable, and store all the
+ * instruction copies there.  (We can allocate additional pages if somebody
+ * inserts a huge number of probes.)  Each page can hold up to INSNS_PER_PAGE
+ * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t)
+ * bytes.
+ */
+#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t)))
+struct kprobe_insn_page {
+	struct hlist_node hlist;
+	kprobe_opcode_t *insns;		/* page of instruction slots */
+	char slot_used[INSNS_PER_PAGE];
+	int nused;
+};
+
+static struct hlist_head kprobe_insn_pages;
+
+/**
+ * get_insn_slot() - Find a slot on an executable page for an instruction.
+ * We allocate an executable page if there's no room on existing ones.
+ */
+static kprobe_opcode_t *get_insn_slot(void)
+{
+	struct kprobe_insn_page *kip;
+	struct hlist_node *pos;
+
+	hlist_for_each(pos, &kprobe_insn_pages) {
+		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+		if (kip->nused < INSNS_PER_PAGE) {
+			int i;
+			for (i = 0; i < INSNS_PER_PAGE; i++) {
+				if (!kip->slot_used[i]) {
+					kip->slot_used[i] = 1;
+					kip->nused++;
+					return kip->insns + (i*MAX_INSN_SIZE);
+				}
+			}
+			/* Surprise!  No unused slots.  Fix kip->nused. */
+			kip->nused = INSNS_PER_PAGE;
+		}
+	}
+
+	/* All out of space.  Need to allocate a new page. Use slot 0.*/
+	kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
+	if (!kip) {
+		return NULL;
+	}
+
+	/*
+	 * For the %rip-relative displacement fixups to be doable, we
+	 * need our instruction copy to be within +/- 2GB of any data it
+	 * might access via %rip.  That is, within 2GB of where the
+	 * kernel image and loaded module images reside.  So we allocate
+	 * a page in the module loading area.
+	 */
+	kip->insns = module_alloc(PAGE_SIZE);
+	if (!kip->insns) {
+		kfree(kip);
+		return NULL;
+	}
+	INIT_HLIST_NODE(&kip->hlist);
+	hlist_add_head(&kip->hlist, &kprobe_insn_pages);
+	memset(kip->slot_used, 0, INSNS_PER_PAGE);
+	kip->slot_used[0] = 1;
+	kip->nused = 1;
+	return kip->insns;
+}
+
+/**
+ * free_insn_slot() - Free instruction slot obtained from get_insn_slot().
+ */
+static void free_insn_slot(kprobe_opcode_t *slot)
+{
+	struct kprobe_insn_page *kip;
+	struct hlist_node *pos;
+
+	hlist_for_each(pos, &kprobe_insn_pages) {
+		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+		if (kip->insns <= slot
+		    && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) {
+			int i = (slot - kip->insns) / MAX_INSN_SIZE;
+			kip->slot_used[i] = 0;
+			kip->nused--;
+			if (kip->nused == 0) {
+				/*
+				 * Page is no longer in use.  Free it unless
+				 * it's the last one.  We keep the last one
+				 * so as not to have to set it up again the
+				 * next time somebody inserts a probe.
+				 */
+				hlist_del(&kip->hlist);
+				if (hlist_empty(&kprobe_insn_pages)) {
+					INIT_HLIST_NODE(&kip->hlist);
+					hlist_add_head(&kip->hlist,
+						&kprobe_insn_pages);
+				} else {
+					module_free(NULL, kip->insns);
+					kfree(kip);
+				}
+			}
+			return;
+		}
+	}
+}
diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c
new file mode 100644
index 000000000000..d7e5d0cf4285
--- /dev/null
+++ b/arch/x86_64/kernel/ldt.c
@@ -0,0 +1,253 @@
+/*
+ * linux/arch/x86_64/kernel/ldt.c
+ *
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Andi Kleen
+ * 
+ * This handles calls from both 32bit and 64bit mode.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+
+#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
+static void flush_ldt(void *null)
+{
+	if (current->active_mm)
+               load_LDT(&current->active_mm->context);
+}
+#endif
+
+static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
+{
+	void *oldldt;
+	void *newldt;
+	unsigned oldsize;
+
+	if (mincount <= (unsigned)pc->size)
+		return 0;
+	oldsize = pc->size;
+	mincount = (mincount+511)&(~511);
+	if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+		newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+	else
+		newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+
+	if (!newldt)
+		return -ENOMEM;
+
+	if (oldsize)
+		memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+	oldldt = pc->ldt;
+	memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
+	wmb();
+	pc->ldt = newldt;
+	wmb();
+	pc->size = mincount;
+	wmb();
+	if (reload) {
+#ifdef CONFIG_SMP
+		cpumask_t mask;
+
+		preempt_disable();
+		mask = cpumask_of_cpu(smp_processor_id());
+		load_LDT(pc);
+		if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+			smp_call_function(flush_ldt, NULL, 1, 1);
+		preempt_enable();
+#else
+		load_LDT(pc);
+#endif
+	}
+	if (oldsize) {
+		if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(oldldt);
+		else
+			kfree(oldldt);
+	}
+	return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+	int err = alloc_ldt(new, old->size, 0);
+	if (err < 0)
+		return err;
+	memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+	return 0;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct mm_struct * old_mm;
+	int retval = 0;
+
+	init_MUTEX(&mm->context.sem);
+	mm->context.size = 0;
+	old_mm = current->mm;
+	if (old_mm && old_mm->context.size > 0) {
+		down(&old_mm->context.sem);
+		retval = copy_ldt(&mm->context, &old_mm->context);
+		up(&old_mm->context.sem);
+	}
+	return retval;
+}
+
+/*
+ * 
+ * Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	if (mm->context.size) {
+		if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(mm->context.ldt);
+		else
+			kfree(mm->context.ldt);
+		mm->context.size = 0;
+	}
+}
+
+static int read_ldt(void __user * ptr, unsigned long bytecount)
+{
+	int err;
+	unsigned long size;
+	struct mm_struct * mm = current->mm;
+
+	if (!mm->context.size)
+		return 0;
+	if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+		bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+
+	down(&mm->context.sem);
+	size = mm->context.size*LDT_ENTRY_SIZE;
+	if (size > bytecount)
+		size = bytecount;
+
+	err = 0;
+	if (copy_to_user(ptr, mm->context.ldt, size))
+		err = -EFAULT;
+	up(&mm->context.sem);
+	if (err < 0)
+		goto error_return;
+	if (size != bytecount) {
+		/* zero-fill the rest */
+		if (clear_user(ptr+size, bytecount-size) != 0) {
+			err = -EFAULT;
+			goto error_return;
+		}
+	}
+	return bytecount;
+error_return:
+	return err;
+}
+
+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+{
+	/* Arbitrary number */ 
+	/* x86-64 default LDT is all zeros */
+	if (bytecount > 128) 
+		bytecount = 128; 	
+	if (clear_user(ptr, bytecount))
+		return -EFAULT;
+	return bytecount; 
+}
+
+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+{
+	struct task_struct *me = current;
+	struct mm_struct * mm = me->mm;
+	__u32 entry_1, entry_2, *lp;
+	int error;
+	struct user_desc ldt_info;
+
+	error = -EINVAL;
+
+	if (bytecount != sizeof(ldt_info))
+		goto out;
+	error = -EFAULT; 	
+	if (copy_from_user(&ldt_info, ptr, bytecount))
+		goto out;
+
+	error = -EINVAL;
+	if (ldt_info.entry_number >= LDT_ENTRIES)
+		goto out;
+	if (ldt_info.contents == 3) {
+		if (oldmode)
+			goto out;
+		if (ldt_info.seg_not_present == 0)
+			goto out;
+	}
+
+	down(&mm->context.sem);
+	if (ldt_info.entry_number >= (unsigned)mm->context.size) {
+		error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+		if (error < 0)
+			goto out_unlock;
+	}
+
+	lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
+
+   	/* Allow LDTs to be cleared by the user. */
+   	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+		if (oldmode || LDT_empty(&ldt_info)) {
+			entry_1 = 0;
+			entry_2 = 0;
+			goto install;
+		}
+	}
+
+	entry_1 = LDT_entry_a(&ldt_info);
+	entry_2 = LDT_entry_b(&ldt_info);
+	if (oldmode)
+		entry_2 &= ~(1 << 20);
+
+	/* Install the new entry ...  */
+install:
+	*lp	= entry_1;
+	*(lp+1)	= entry_2;
+	error = 0;
+
+out_unlock:
+	up(&mm->context.sem);
+out:
+	return error;
+}
+
+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+{
+	int ret = -ENOSYS;
+
+	switch (func) {
+	case 0:
+		ret = read_ldt(ptr, bytecount);
+		break;
+	case 1:
+		ret = write_ldt(ptr, bytecount, 1);
+		break;
+	case 2:
+		ret = read_default_ldt(ptr, bytecount);
+		break;
+	case 0x11:
+		ret = write_ldt(ptr, bytecount, 0);
+		break;
+	}
+	return ret;
+}
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
new file mode 100644
index 000000000000..86f9fd85016a
--- /dev/null
+++ b/arch/x86_64/kernel/mce.c
@@ -0,0 +1,548 @@
+/*
+ * Machine check handler.
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s). 
+ * 2004 Andi Kleen. Rewrote most of it. 
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
+#include <linux/sysdev.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <asm/processor.h> 
+#include <asm/msr.h>
+#include <asm/mce.h>
+#include <asm/kdebug.h>
+#include <asm/uaccess.h>
+
+#define MISC_MCELOG_MINOR 227
+#define NR_BANKS 5
+
+static int mce_dont_init;
+
+/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
+   3: never panic or exit (for testing only) */
+static int tolerant = 1;
+static int banks;
+static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
+static unsigned long console_logged;
+static int notify_user;
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+struct mce_log mcelog = { 
+	MCE_LOG_SIGNATURE,
+	MCE_LOG_LEN,
+}; 
+
+void mce_log(struct mce *mce)
+{
+	unsigned next, entry;
+	mce->finished = 0;
+	smp_wmb();
+	for (;;) {
+		entry = rcu_dereference(mcelog.next);
+		/* When the buffer fills up discard new entries. Assume 
+		   that the earlier errors are the more interesting. */
+		if (entry >= MCE_LOG_LEN) {
+			set_bit(MCE_OVERFLOW, &mcelog.flags);
+			return;
+		}
+		/* Old left over entry. Skip. */
+		if (mcelog.entry[entry].finished)
+			continue;
+		smp_rmb();
+		next = entry + 1;
+		if (cmpxchg(&mcelog.next, entry, next) == entry)
+			break;
+	}
+	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+	smp_wmb();
+	mcelog.entry[entry].finished = 1;
+	smp_wmb();
+
+	if (!test_and_set_bit(0, &console_logged))
+		notify_user = 1;
+}
+
+static void print_mce(struct mce *m)
+{
+	printk(KERN_EMERG "\n"
+	       KERN_EMERG
+	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+	       m->cpu, m->mcgstatus, m->bank, m->status);
+	if (m->rip) {
+		printk(KERN_EMERG 
+		       "RIP%s %02x:<%016Lx> ",
+		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+		       m->cs, m->rip);
+		if (m->cs == __KERNEL_CS)
+			print_symbol("{%s}", m->rip);
+		printk("\n");
+	}
+	printk(KERN_EMERG "TSC %Lx ", m->tsc); 
+	if (m->addr)
+		printk("ADDR %Lx ", m->addr);
+	if (m->misc)
+		printk("MISC %Lx ", m->misc); 	
+	printk("\n");
+}
+
+static void mce_panic(char *msg, struct mce *backup, unsigned long start)
+{ 
+	int i;
+	oops_begin();
+	for (i = 0; i < MCE_LOG_LEN; i++) {
+		unsigned long tsc = mcelog.entry[i].tsc;
+		if (time_before(tsc, start))
+			continue;
+		print_mce(&mcelog.entry[i]); 
+		if (backup && mcelog.entry[i].tsc == backup->tsc)
+			backup = NULL;
+	}
+	if (backup)
+		print_mce(backup);
+	if (tolerant >= 3)
+		printk("Fake panic: %s\n", msg);
+	else
+		panic(msg);
+} 
+
+static int mce_available(struct cpuinfo_x86 *c)
+{
+	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
+	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
+}
+
+/* 
+ * The actual machine check handler
+ */
+
+void do_machine_check(struct pt_regs * regs, long error_code)
+{
+	struct mce m, panicm;
+	int nowayout = (tolerant < 1); 
+	int kill_it = 0;
+	u64 mcestart = 0;
+	int i;
+	int panicm_found = 0;
+
+	if (regs)
+		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
+	if (!banks)
+		return;
+
+	memset(&m, 0, sizeof(struct mce));
+	m.cpu = hard_smp_processor_id();
+	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	if (!(m.mcgstatus & MCG_STATUS_RIPV))
+		kill_it = 1;
+	
+	rdtscll(mcestart);
+	barrier();
+
+	for (i = 0; i < banks; i++) {
+		if (!bank[i])
+			continue;
+		
+		m.misc = 0; 
+		m.addr = 0;
+		m.bank = i;
+		m.tsc = 0;
+
+		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+		if ((m.status & MCI_STATUS_VAL) == 0)
+			continue;
+
+		if (m.status & MCI_STATUS_EN) {
+			/* In theory _OVER could be a nowayout too, but
+			   assume any overflowed errors were no fatal. */
+			nowayout |= !!(m.status & MCI_STATUS_PCC);
+			kill_it |= !!(m.status & MCI_STATUS_UC);
+		}
+
+		if (m.status & MCI_STATUS_MISCV)
+			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+		if (m.status & MCI_STATUS_ADDRV)
+			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+		if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) {
+			m.rip = regs->rip;
+			m.cs = regs->cs;
+		} else {
+			m.rip = 0;
+			m.cs = 0;
+		}
+
+		if (error_code != -1)
+			rdtscll(m.tsc);
+		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
+		mce_log(&m);
+
+		/* Did this bank cause the exception? */
+		/* Assume that the bank with uncorrectable errors did it,
+		   and that there is only a single one. */
+		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
+			panicm = m;
+			panicm_found = 1;
+		}
+
+		tainted |= TAINT_MACHINE_CHECK;
+	}
+
+	/* Never do anything final in the polling timer */
+	if (!regs)
+		goto out;
+
+	/* If we didn't find an uncorrectable error, pick
+	   the last one (shouldn't happen, just being safe). */
+	if (!panicm_found)
+		panicm = m;
+	if (nowayout)
+		mce_panic("Machine check", &panicm, mcestart);
+	if (kill_it) {
+		int user_space = 0;
+
+		if (m.mcgstatus & MCG_STATUS_RIPV)
+			user_space = panicm.rip && (panicm.cs & 3);
+		
+		/* When the machine was in user space and the CPU didn't get
+		   confused it's normally not necessary to panic, unless you 
+		   are paranoid (tolerant == 0)
+
+		   RED-PEN could be more tolerant for MCEs in idle,
+		   but most likely they occur at boot anyways, where
+		   it is best to just halt the machine. */
+		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
+		    (unsigned)current->pid <= 1)
+			mce_panic("Uncorrected machine check", &panicm, mcestart);
+
+		/* do_exit takes an awful lot of locks and has as
+		   slight risk of deadlocking. If you don't want that
+		   don't set tolerant >= 2 */
+		if (tolerant < 3)
+			do_exit(SIGBUS);
+	}
+
+ out:
+	/* Last thing done in the machine check exception to clear state. */
+	wrmsrl(MSR_IA32_MCG_STATUS, 0);
+}
+
+/*
+ * Periodic polling timer for "silent" machine check errors.
+ */
+
+static int check_interval = 5 * 60; /* 5 minutes */
+static void mcheck_timer(void *data);
+static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
+
+static void mcheck_check_cpu(void *info)
+{
+	if (mce_available(&current_cpu_data))
+		do_machine_check(NULL, 0);
+}
+
+static void mcheck_timer(void *data)
+{
+	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
+	schedule_delayed_work(&mcheck_work, check_interval * HZ);
+
+	/*
+	 * It's ok to read stale data here for notify_user and
+	 * console_logged as we'll simply get the updated versions
+	 * on the next mcheck_timer execution and atomic operations
+	 * on console_logged act as synchronization for notify_user
+	 * writes.
+	 */
+	if (notify_user && console_logged) {
+		notify_user = 0;
+		clear_bit(0, &console_logged);
+		printk(KERN_INFO "Machine check events logged\n");
+	}
+}
+
+
+static __init int periodic_mcheck_init(void)
+{ 
+	if (check_interval)
+		schedule_delayed_work(&mcheck_work, check_interval*HZ);
+	return 0;
+} 
+__initcall(periodic_mcheck_init);
+
+
+/* 
+ * Initialize Machine Checks for a CPU.
+ */
+static void mce_init(void *dummy)
+{
+	u64 cap;
+	int i;
+
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
+	banks = cap & 0xff;
+	if (banks > NR_BANKS) { 
+		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
+		banks = NR_BANKS; 
+	}
+
+	/* Log the machine checks left over from the previous reset.
+	   This also clears all registers */
+	do_machine_check(NULL, -1);
+
+	set_in_cr4(X86_CR4_MCE);
+
+	if (cap & MCG_CTL_P)
+		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+
+	for (i = 0; i < banks; i++) {
+		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}	
+}
+
+/* Add per CPU specific workarounds here */
+static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) 
+{ 
+	/* This should be disabled by the BIOS, but isn't always */
+	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
+		/* disable GART TBL walk error reporting, which trips off 
+		   incorrectly with the IOMMU & 3ware & Cerberus. */
+		clear_bit(10, &bank[4]);
+	}
+}			
+
+static void __init mce_cpu_features(struct cpuinfo_x86 *c)
+{
+	switch (c->x86_vendor) {
+	case X86_VENDOR_INTEL:
+		mce_intel_feature_init(c);
+		break;
+	default:
+		break;
+	}
+}
+
+/* 
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off. 
+ */
+void __init mcheck_init(struct cpuinfo_x86 *c)
+{
+	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
+
+	mce_cpu_quirks(c); 
+
+	if (mce_dont_init ||
+	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
+	    !mce_available(c))
+		return;
+
+	mce_init(NULL);
+	mce_cpu_features(c);
+}
+
+/*
+ * Character device to read and clear the MCE log.
+ */
+
+static void collect_tscs(void *data) 
+{ 
+	unsigned long *cpu_tsc = (unsigned long *)data;
+	rdtscll(cpu_tsc[smp_processor_id()]);
+} 
+
+static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
+{
+	unsigned long cpu_tsc[NR_CPUS];
+	static DECLARE_MUTEX(mce_read_sem);
+	unsigned next;
+	char __user *buf = ubuf;
+	int i, err;
+
+	down(&mce_read_sem); 
+	next = rcu_dereference(mcelog.next);
+
+	/* Only supports full reads right now */
+	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 
+		up(&mce_read_sem);
+		return -EINVAL;
+	}
+
+	err = 0;
+	for (i = 0; i < next; i++) {
+		if (!mcelog.entry[i].finished)
+			continue;
+		smp_rmb();
+		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
+		buf += sizeof(struct mce); 
+	} 
+
+	memset(mcelog.entry, 0, next * sizeof(struct mce));
+	mcelog.next = 0;
+
+	synchronize_kernel();	
+
+	/* Collect entries that were still getting written before the synchronize. */
+
+	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
+	for (i = next; i < MCE_LOG_LEN; i++) { 
+		if (mcelog.entry[i].finished && 
+		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {  
+			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
+			smp_rmb();
+			buf += sizeof(struct mce);
+			memset(&mcelog.entry[i], 0, sizeof(struct mce));
+		}
+	} 	
+	up(&mce_read_sem);
+	return err ? -EFAULT : buf - ubuf; 
+}
+
+static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
+{
+	int __user *p = (int __user *)arg;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM; 
+	switch (cmd) {
+	case MCE_GET_RECORD_LEN: 
+		return put_user(sizeof(struct mce), p);
+	case MCE_GET_LOG_LEN:
+		return put_user(MCE_LOG_LEN, p);		
+	case MCE_GETCLEAR_FLAGS: {
+		unsigned flags;
+		do { 
+			flags = mcelog.flags;
+		} while (cmpxchg(&mcelog.flags, flags, 0) != flags); 
+		return put_user(flags, p); 
+	}
+	default:
+		return -ENOTTY; 
+	} 
+}
+
+static struct file_operations mce_chrdev_ops = {
+	.read = mce_read,
+	.ioctl = mce_ioctl,
+};
+
+static struct miscdevice mce_log_device = {
+	MISC_MCELOG_MINOR,
+	"mcelog",
+	&mce_chrdev_ops,
+};
+
+/* 
+ * Old style boot options parsing. Only for compatibility. 
+ */
+
+static int __init mcheck_disable(char *str)
+{
+	mce_dont_init = 1;
+	return 0;
+}
+
+/* mce=off disables machine check. Note you can reenable it later
+   using sysfs */
+static int __init mcheck_enable(char *str)
+{
+	if (!strcmp(str, "off"))
+		mce_dont_init = 1;
+	else
+		printk("mce= argument %s ignored. Please use /sys", str); 
+	return 0;
+}
+
+__setup("nomce", mcheck_disable);
+__setup("mce", mcheck_enable);
+
+/* 
+ * Sysfs support
+ */ 
+
+/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
+static int mce_resume(struct sys_device *dev)
+{
+	on_each_cpu(mce_init, NULL, 1, 1);
+	return 0;
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void) 
+{ 
+	if (check_interval)
+		cancel_delayed_work(&mcheck_work);
+	/* Timer race is harmless here */
+	on_each_cpu(mce_init, NULL, 1, 1);       
+	if (check_interval)
+		schedule_delayed_work(&mcheck_work, check_interval*HZ);
+}
+
+static struct sysdev_class mce_sysclass = {
+	.resume = mce_resume,
+	set_kset_name("machinecheck"),
+};
+
+static struct sys_device device_mce = {
+	.id	= 0,
+	.cls	= &mce_sysclass,
+};
+
+/* Why are there no generic functions for this? */
+#define ACCESSOR(name, var, start) \
+	static ssize_t show_ ## name(struct sys_device *s, char *buf) { 	   	   \
+		return sprintf(buf, "%lx\n", (unsigned long)var);		   \
+	} 									   \
+	static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
+		char *end; 							   \
+		unsigned long new = simple_strtoul(buf, &end, 0); 		   \
+		if (end == buf) return -EINVAL;					   \
+		var = new;							   \
+		start; 								   \
+		return end-buf;		     					   \
+	}									   \
+	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
+
+ACCESSOR(bank0ctl,bank[0],mce_restart())
+ACCESSOR(bank1ctl,bank[1],mce_restart())
+ACCESSOR(bank2ctl,bank[2],mce_restart())
+ACCESSOR(bank3ctl,bank[3],mce_restart())
+ACCESSOR(bank4ctl,bank[4],mce_restart())
+ACCESSOR(tolerant,tolerant,)
+ACCESSOR(check_interval,check_interval,mce_restart())
+
+static __init int mce_init_device(void)
+{
+	int err;
+	if (!mce_available(&boot_cpu_data))
+		return -EIO;
+	err = sysdev_class_register(&mce_sysclass);
+	if (!err)
+		err = sysdev_register(&device_mce);
+	if (!err) { 
+		/* could create per CPU objects, but it is not worth it. */
+		sysdev_create_file(&device_mce, &attr_bank0ctl); 
+		sysdev_create_file(&device_mce, &attr_bank1ctl); 
+		sysdev_create_file(&device_mce, &attr_bank2ctl); 
+		sysdev_create_file(&device_mce, &attr_bank3ctl); 
+		sysdev_create_file(&device_mce, &attr_bank4ctl); 
+		sysdev_create_file(&device_mce, &attr_tolerant); 
+		sysdev_create_file(&device_mce, &attr_check_interval);
+	} 
+	
+	misc_register(&mce_log_device);
+	return err;
+
+}
+device_initcall(mce_init_device);
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
new file mode 100644
index 000000000000..4db9a640069f
--- /dev/null
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -0,0 +1,99 @@
+/*
+ * Intel specific MCE features.
+ * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+#include <asm/hw_irq.h>
+
+static DEFINE_PER_CPU(unsigned long, next_check);
+
+asmlinkage void smp_thermal_interrupt(void)
+{
+	struct mce m;
+
+	ack_APIC_irq();
+
+	irq_enter();
+	if (time_before(jiffies, __get_cpu_var(next_check)))
+		goto done;
+
+	__get_cpu_var(next_check) = jiffies + HZ*300;
+	memset(&m, 0, sizeof(m));
+	m.cpu = smp_processor_id();
+	m.bank = MCE_THERMAL_BANK;
+	rdtscll(m.tsc);
+	rdmsrl(MSR_IA32_THERM_STATUS, m.status);
+	if (m.status & 0x1) {
+		printk(KERN_EMERG
+			"CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
+		add_taint(TAINT_MACHINE_CHECK);
+	} else {
+		printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
+	}
+
+	mce_log(&m);
+done:
+	irq_exit();
+}
+
+static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+{
+	u32 l, h;
+	int tm2 = 0;
+	unsigned int cpu = smp_processor_id();
+
+	if (!cpu_has(c, X86_FEATURE_ACPI))
+		return;
+
+	if (!cpu_has(c, X86_FEATURE_ACC))
+		return;
+
+	/* first check if TM1 is already enabled by the BIOS, in which
+	 * case there might be some SMM goo which handles it, so we can't even
+	 * put a handler since it might be delivered via SMI already.
+	 */
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	h = apic_read(APIC_LVTTHMR);
+	if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
+		printk(KERN_DEBUG
+		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+		return;
+	}
+
+	if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
+		tm2 = 1;
+
+	if (h & APIC_VECTOR_MASK) {
+		printk(KERN_DEBUG
+		       "CPU%d: Thermal LVT vector (%#x) already "
+		       "installed\n", cpu, (h & APIC_VECTOR_MASK));
+		return;
+	}
+
+	h = THERMAL_APIC_VECTOR;
+	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
+	apic_write_around(APIC_LVTTHMR, h);
+
+	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
+
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
+
+	l = apic_read(APIC_LVTTHMR);
+	apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
+		cpu, tm2 ? "TM2" : "TM1");
+	return;
+}
+
+void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+	intel_init_thermal(c);
+}
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c
new file mode 100644
index 000000000000..c2ffea8845ed
--- /dev/null
+++ b/arch/x86_64/kernel/module.c
@@ -0,0 +1,166 @@
+/*  Kernel module help for x86-64
+    Copyright (C) 2001 Rusty Russell.
+    Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <linux/moduleloader.h>
+#include <linux/elf.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#define DEBUGP(fmt...) 
+
+void module_free(struct module *mod, void *module_region)
+{
+	vfree(module_region);
+}
+
+void *module_alloc(unsigned long size)
+{
+	struct vm_struct *area;
+
+	if (!size)
+		return NULL;
+	size = PAGE_ALIGN(size);
+	if (size > MODULES_LEN)
+		return NULL;
+
+	area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
+	if (!area)
+		return NULL;
+
+	return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
+}
+
+/* We don't need anything special. */
+int module_frob_arch_sections(Elf_Ehdr *hdr,
+			      Elf_Shdr *sechdrs,
+			      char *secstrings,
+			      struct module *mod)
+{
+	return 0;
+}
+
+int apply_relocate_add(Elf64_Shdr *sechdrs,
+		   const char *strtab,
+		   unsigned int symindex,
+		   unsigned int relsec,
+		   struct module *me)
+{
+	unsigned int i;
+	Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
+	Elf64_Sym *sym;
+	void *loc;
+	u64 val; 
+
+	DEBUGP("Applying relocate section %u to %u\n", relsec,
+	       sechdrs[relsec].sh_info);
+	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+		/* This is where to make the change */
+		loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+			+ rel[i].r_offset;
+
+		/* This is the symbol it is referring to.  Note that all
+		   undefined symbols have been resolved.  */
+		sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+			+ ELF64_R_SYM(rel[i].r_info);
+
+	        DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
+		       (int)ELF64_R_TYPE(rel[i].r_info), 
+		       sym->st_value, rel[i].r_addend, (u64)loc);
+
+		val = sym->st_value + rel[i].r_addend; 
+
+		switch (ELF64_R_TYPE(rel[i].r_info)) {
+		case R_X86_64_NONE:
+			break;
+		case R_X86_64_64:
+			*(u64 *)loc = val;
+			break;
+		case R_X86_64_32:
+			*(u32 *)loc = val;
+			if (val != *(u32 *)loc)
+				goto overflow;
+			break;
+		case R_X86_64_32S:
+			*(s32 *)loc = val;
+			if ((s64)val != *(s32 *)loc)
+				goto overflow;
+			break;
+		case R_X86_64_PC32: 
+			val -= (u64)loc;
+			*(u32 *)loc = val;
+#if 0
+			if ((s64)val != *(s32 *)loc)
+				goto overflow; 
+#endif
+			break;
+		default:
+			printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
+			       me->name, ELF64_R_TYPE(rel[i].r_info));
+			return -ENOEXEC;
+		}
+	}
+	return 0;
+
+overflow:
+	printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 
+	       (int)ELF64_R_TYPE(rel[i].r_info), val);
+	printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
+	       me->name);
+	return -ENOEXEC;
+}
+
+int apply_relocate(Elf_Shdr *sechdrs,
+		   const char *strtab,
+		   unsigned int symindex,
+		   unsigned int relsec,
+		   struct module *me)
+{
+	printk("non add relocation not supported\n");
+	return -ENOSYS;
+} 
+
+extern void apply_alternatives(void *start, void *end); 
+
+int module_finalize(const Elf_Ehdr *hdr,
+		    const Elf_Shdr *sechdrs,
+		    struct module *me)
+{
+	const Elf_Shdr *s;
+	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+	/* look for .altinstructions to patch */ 
+	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 
+		void *seg; 		
+		if (strcmp(".altinstructions", secstrings + s->sh_name))
+			continue;
+		seg = (void *)s->sh_addr; 
+		apply_alternatives(seg, seg + s->sh_size); 
+	} 	
+	return 0;
+}
+
+void module_arch_cleanup(struct module *mod)
+{
+}
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
new file mode 100644
index 000000000000..7ec031c6ca10
--- /dev/null
+++ b/arch/x86_64/kernel/mpparse.c
@@ -0,0 +1,949 @@
+/*
+ *	Intel Multiprocessor Specification 1.1 and 1.4
+ *	compliant MP-table parsing routines.
+ *
+ *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *	(c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *	Fixes
+ *		Erich Boleyn	:	MP v1.4 and additional changes.
+ *		Alan Cox	:	Added EBDA scanning
+ *		Ingo Molnar	:	various cleanups and rewrites
+ *		Maciej W. Rozycki:	Bits for default MP configurations
+ *		Paul Diefenbaugh:	Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/config.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+
+/* Have we found an MP table */
+int smp_found_config;
+unsigned int __initdata maxcpus = NR_CPUS;
+
+int acpi_found_madt;
+
+/*
+ * Various Linux-internal data structures created from the
+ * MP-table.
+ */
+int apic_version [MAX_APICS];
+unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL };
+
+static int mp_current_pci_id = 0;
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+
+/* # of MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* MP IRQ source entries */
+int mp_irq_entries;
+
+int nr_ioapics;
+int pic_mode;
+unsigned long mp_lapic_addr = 0;
+
+
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_id = -1U;
+/* Internal processor count */
+static unsigned int num_processors = 0;
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
+
+/* ACPI MADT entry parsing functions */
+#ifdef CONFIG_ACPI_BOOT
+extern struct acpi_boot_flags acpi_boot;
+#ifdef CONFIG_X86_LOCAL_APIC
+extern int acpi_parse_lapic (acpi_table_entry_header *header);
+extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
+extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
+#endif /*CONFIG_X86_LOCAL_APIC*/
+#ifdef CONFIG_X86_IO_APIC
+extern int acpi_parse_ioapic (acpi_table_entry_header *header);
+#endif /*CONFIG_X86_IO_APIC*/
+#endif /*CONFIG_ACPI_BOOT*/
+
+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+
+/*
+ * Intel MP BIOS table parsing routines:
+ */
+
+/*
+ * Checksum an MP configuration block.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+	int sum = 0;
+
+	while (len--)
+		sum += *mp++;
+
+	return sum & 0xFF;
+}
+
+static void __init MP_processor_info (struct mpc_config_processor *m)
+{
+	int ver;
+
+	if (!(m->mpc_cpuflag & CPU_ENABLED))
+		return;
+
+	printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
+		m->mpc_apicid,
+	       (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
+	       (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
+		m->mpc_apicver);
+
+	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+		Dprintk("    Bootup CPU\n");
+		boot_cpu_id = m->mpc_apicid;
+	}
+	if (num_processors >= NR_CPUS) {
+		printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+			" Processor ignored.\n", NR_CPUS);
+		return;
+	}
+	if (num_processors >= maxcpus) {
+		printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
+			" Processor ignored.\n", maxcpus);
+		return;
+	}
+
+	num_processors++;
+
+	if (m->mpc_apicid > MAX_APICS) {
+		printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
+			m->mpc_apicid, MAX_APICS);
+		return;
+	}
+	ver = m->mpc_apicver;
+
+	physid_set(m->mpc_apicid, phys_cpu_present_map);
+	/*
+	 * Validate version
+	 */
+	if (ver == 0x0) {
+		printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
+		ver = 0x10;
+	}
+	apic_version[m->mpc_apicid] = ver;
+	bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
+}
+
+static void __init MP_bus_info (struct mpc_config_bus *m)
+{
+	char str[7];
+
+	memcpy(str, m->mpc_bustype, 6);
+	str[6] = 0;
+	Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+
+	if (strncmp(str, "ISA", 3) == 0) {
+		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+	} else if (strncmp(str, "EISA", 4) == 0) {
+		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+	} else if (strncmp(str, "PCI", 3) == 0) {
+		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+		mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
+		mp_current_pci_id++;
+	} else if (strncmp(str, "MCA", 3) == 0) {
+		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
+	} else {
+		printk(KERN_ERR "Unknown bustype %s\n", str);
+	}
+}
+
+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
+{
+	if (!(m->mpc_flags & MPC_APIC_USABLE))
+		return;
+
+	printk("I/O APIC #%d Version %d at 0x%X.\n",
+		m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+	if (nr_ioapics >= MAX_IO_APICS) {
+		printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
+			MAX_IO_APICS, nr_ioapics);
+		panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
+	}
+	if (!m->mpc_apicaddr) {
+		printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
+			" found in MP table, skipping!\n");
+		return;
+	}
+	mp_ioapics[nr_ioapics] = *m;
+	nr_ioapics++;
+}
+
+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
+{
+	mp_irqs [mp_irq_entries] = *m;
+	Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
+			m->mpc_irqtype, m->mpc_irqflag & 3,
+			(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+			m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+		panic("Max # of irq sources exceeded!!\n");
+}
+
+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
+{
+	Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+		" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+			m->mpc_irqtype, m->mpc_irqflag & 3,
+			(m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
+			m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+	/*
+	 * Well it seems all SMP boards in existence
+	 * use ExtINT/LVT1 == LINT0 and
+	 * NMI/LVT2 == LINT1 - the following check
+	 * will show us if this assumptions is false.
+	 * Until then we do not have to add baggage.
+	 */
+	if ((m->mpc_irqtype == mp_ExtINT) &&
+		(m->mpc_destapiclint != 0))
+			BUG();
+	if ((m->mpc_irqtype == mp_NMI) &&
+		(m->mpc_destapiclint != 1))
+			BUG();
+}
+
+/*
+ * Read/parse the MPC
+ */
+
+static int __init smp_read_mpc(struct mp_config_table *mpc)
+{
+	char str[16];
+	int count=sizeof(*mpc);
+	unsigned char *mpt=((unsigned char *)mpc)+count;
+
+	if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
+		printk("SMP mptable: bad signature [%c%c%c%c]!\n",
+			mpc->mpc_signature[0],
+			mpc->mpc_signature[1],
+			mpc->mpc_signature[2],
+			mpc->mpc_signature[3]);
+		return 0;
+	}
+	if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
+		printk("SMP mptable: checksum error!\n");
+		return 0;
+	}
+	if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
+		printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
+			mpc->mpc_spec);
+		return 0;
+	}
+	if (!mpc->mpc_lapic) {
+		printk(KERN_ERR "SMP mptable: null local APIC address!\n");
+		return 0;
+	}
+	memcpy(str,mpc->mpc_oem,8);
+	str[8]=0;
+	printk(KERN_INFO "OEM ID: %s ",str);
+
+	memcpy(str,mpc->mpc_productid,12);
+	str[12]=0;
+	printk(KERN_INFO "Product ID: %s ",str);
+
+	printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic);
+
+	/* save the local APIC address, it might be non-default */
+	if (!acpi_lapic)
+	mp_lapic_addr = mpc->mpc_lapic;
+
+	/*
+	 *	Now process the configuration blocks.
+	 */
+	while (count < mpc->mpc_length) {
+		switch(*mpt) {
+			case MP_PROCESSOR:
+			{
+				struct mpc_config_processor *m=
+					(struct mpc_config_processor *)mpt;
+				if (!acpi_lapic)
+				MP_processor_info(m);
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+			case MP_BUS:
+			{
+				struct mpc_config_bus *m=
+					(struct mpc_config_bus *)mpt;
+				MP_bus_info(m);
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+			case MP_IOAPIC:
+			{
+				struct mpc_config_ioapic *m=
+					(struct mpc_config_ioapic *)mpt;
+				MP_ioapic_info(m);
+				mpt+=sizeof(*m);
+				count+=sizeof(*m);
+				break;
+			}
+			case MP_INTSRC:
+			{
+				struct mpc_config_intsrc *m=
+					(struct mpc_config_intsrc *)mpt;
+
+				MP_intsrc_info(m);
+				mpt+=sizeof(*m);
+				count+=sizeof(*m);
+				break;
+			}
+			case MP_LINTSRC:
+			{
+				struct mpc_config_lintsrc *m=
+					(struct mpc_config_lintsrc *)mpt;
+				MP_lintsrc_info(m);
+				mpt+=sizeof(*m);
+				count+=sizeof(*m);
+				break;
+			}
+		}
+	}
+	clustered_apic_check();
+	if (!num_processors)
+		printk(KERN_ERR "SMP mptable: no processors registered!\n");
+	return num_processors;
+}
+
+static int __init ELCR_trigger(unsigned int irq)
+{
+	unsigned int port;
+
+	port = 0x4d0 + (irq >> 3);
+	return (inb(port) >> (irq & 7)) & 1;
+}
+
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+	struct mpc_config_intsrc intsrc;
+	int i;
+	int ELCR_fallback = 0;
+
+	intsrc.mpc_type = MP_INTSRC;
+	intsrc.mpc_irqflag = 0;			/* conforming */
+	intsrc.mpc_srcbus = 0;
+	intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+
+	intsrc.mpc_irqtype = mp_INT;
+
+	/*
+	 *  If true, we have an ISA/PCI system with no IRQ entries
+	 *  in the MP table. To prevent the PCI interrupts from being set up
+	 *  incorrectly, we try to use the ELCR. The sanity check to see if
+	 *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+	 *  never be level sensitive, so we simply see if the ELCR agrees.
+	 *  If it does, we assume it's valid.
+	 */
+	if (mpc_default_type == 5) {
+		printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
+
+		if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
+			printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
+		else {
+			printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
+			ELCR_fallback = 1;
+		}
+	}
+
+	for (i = 0; i < 16; i++) {
+		switch (mpc_default_type) {
+		case 2:
+			if (i == 0 || i == 13)
+				continue;	/* IRQ0 & IRQ13 not connected */
+			/* fall through */
+		default:
+			if (i == 2)
+				continue;	/* IRQ2 is never connected */
+		}
+
+		if (ELCR_fallback) {
+			/*
+			 *  If the ELCR indicates a level-sensitive interrupt, we
+			 *  copy that information over to the MP table in the
+			 *  irqflag field (level sensitive, active high polarity).
+			 */
+			if (ELCR_trigger(i))
+				intsrc.mpc_irqflag = 13;
+			else
+				intsrc.mpc_irqflag = 0;
+		}
+
+		intsrc.mpc_srcbusirq = i;
+		intsrc.mpc_dstirq = i ? i : 2;		/* IRQ0 to INTIN2 */
+		MP_intsrc_info(&intsrc);
+	}
+
+	intsrc.mpc_irqtype = mp_ExtINT;
+	intsrc.mpc_srcbusirq = 0;
+	intsrc.mpc_dstirq = 0;				/* 8259A to INTIN0 */
+	MP_intsrc_info(&intsrc);
+}
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+	struct mpc_config_processor processor;
+	struct mpc_config_bus bus;
+	struct mpc_config_ioapic ioapic;
+	struct mpc_config_lintsrc lintsrc;
+	int linttypes[2] = { mp_ExtINT, mp_NMI };
+	int i;
+
+	/*
+	 * local APIC has default address
+	 */
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+	/*
+	 * 2 CPUs, numbered 0 & 1.
+	 */
+	processor.mpc_type = MP_PROCESSOR;
+	/* Either an integrated APIC or a discrete 82489DX. */
+	processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+	processor.mpc_cpuflag = CPU_ENABLED;
+	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+				   (boot_cpu_data.x86_model << 4) |
+				   boot_cpu_data.x86_mask;
+	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+	processor.mpc_reserved[0] = 0;
+	processor.mpc_reserved[1] = 0;
+	for (i = 0; i < 2; i++) {
+		processor.mpc_apicid = i;
+		MP_processor_info(&processor);
+	}
+
+	bus.mpc_type = MP_BUS;
+	bus.mpc_busid = 0;
+	switch (mpc_default_type) {
+		default:
+			printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+				mpc_default_type);
+			/* fall through */
+		case 1:
+		case 5:
+			memcpy(bus.mpc_bustype, "ISA   ", 6);
+			break;
+		case 2:
+		case 6:
+		case 3:
+			memcpy(bus.mpc_bustype, "EISA  ", 6);
+			break;
+		case 4:
+		case 7:
+			memcpy(bus.mpc_bustype, "MCA   ", 6);
+	}
+	MP_bus_info(&bus);
+	if (mpc_default_type > 4) {
+		bus.mpc_busid = 1;
+		memcpy(bus.mpc_bustype, "PCI   ", 6);
+		MP_bus_info(&bus);
+	}
+
+	ioapic.mpc_type = MP_IOAPIC;
+	ioapic.mpc_apicid = 2;
+	ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+	ioapic.mpc_flags = MPC_APIC_USABLE;
+	ioapic.mpc_apicaddr = 0xFEC00000;
+	MP_ioapic_info(&ioapic);
+
+	/*
+	 * We set up most of the low 16 IO-APIC pins according to MPS rules.
+	 */
+	construct_default_ioirq_mptable(mpc_default_type);
+
+	lintsrc.mpc_type = MP_LINTSRC;
+	lintsrc.mpc_irqflag = 0;		/* conforming */
+	lintsrc.mpc_srcbusid = 0;
+	lintsrc.mpc_srcbusirq = 0;
+	lintsrc.mpc_destapic = MP_APIC_ALL;
+	for (i = 0; i < 2; i++) {
+		lintsrc.mpc_irqtype = linttypes[i];
+		lintsrc.mpc_destapiclint = i;
+		MP_lintsrc_info(&lintsrc);
+	}
+}
+
+static struct intel_mp_floating *mpf_found;
+
+/*
+ * Scan the memory blocks for an SMP configuration block.
+ */
+void __init get_smp_config (void)
+{
+	struct intel_mp_floating *mpf = mpf_found;
+
+	/*
+ 	 * ACPI may be used to obtain the entire SMP configuration or just to 
+ 	 * enumerate/configure processors (CONFIG_ACPI_BOOT).  Note that 
+ 	 * ACPI supports both logical (e.g. Hyper-Threading) and physical 
+ 	 * processors, where MPS only supports physical.
+ 	 */
+ 	if (acpi_lapic && acpi_ioapic) {
+ 		printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
+ 		return;
+	}
+ 	else if (acpi_lapic)
+ 		printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
+
+	printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
+	if (mpf->mpf_feature2 & (1<<7)) {
+		printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
+		pic_mode = 1;
+	} else {
+		printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
+		pic_mode = 0;
+	}
+
+	/*
+	 * Now see if we need to read further.
+	 */
+	if (mpf->mpf_feature1 != 0) {
+
+		printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
+		construct_default_ISA_mptable(mpf->mpf_feature1);
+
+	} else if (mpf->mpf_physptr) {
+
+		/*
+		 * Read the physical hardware table.  Anything here will
+		 * override the defaults.
+		 */
+		if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) {
+			smp_found_config = 0;
+			printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
+			printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
+			return;
+		}
+		/*
+		 * If there are no explicit MP IRQ entries, then we are
+		 * broken.  We set up most of the low 16 IO-APIC pins to
+		 * ISA defaults and hope it will work.
+		 */
+		if (!mp_irq_entries) {
+			struct mpc_config_bus bus;
+
+			printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+
+			bus.mpc_type = MP_BUS;
+			bus.mpc_busid = 0;
+			memcpy(bus.mpc_bustype, "ISA   ", 6);
+			MP_bus_info(&bus);
+
+			construct_default_ioirq_mptable(0);
+		}
+
+	} else
+		BUG();
+
+	printk(KERN_INFO "Processors: %d\n", num_processors);
+	/*
+	 * Only use the first configuration found.
+	 */
+}
+
+static int __init smp_scan_config (unsigned long base, unsigned long length)
+{
+	extern void __bad_mpf_size(void); 
+	unsigned int *bp = phys_to_virt(base);
+	struct intel_mp_floating *mpf;
+
+	Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
+	if (sizeof(*mpf) != 16)
+		__bad_mpf_size();
+
+	while (length > 0) {
+		mpf = (struct intel_mp_floating *)bp;
+		if ((*bp == SMP_MAGIC_IDENT) &&
+			(mpf->mpf_length == 1) &&
+			!mpf_checksum((unsigned char *)bp, 16) &&
+			((mpf->mpf_specification == 1)
+				|| (mpf->mpf_specification == 4)) ) {
+
+			smp_found_config = 1;
+			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+			if (mpf->mpf_physptr)
+				reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
+			mpf_found = mpf;
+			return 1;
+		}
+		bp += 4;
+		length -= 16;
+	}
+	return 0;
+}
+
+void __init find_intel_smp (void)
+{
+	unsigned int address;
+
+	/*
+	 * FIXME: Linux assumes you have 640K of base ram..
+	 * this continues the error...
+	 *
+	 * 1) Scan the bottom 1K for a signature
+	 * 2) Scan the top 1K of base RAM
+	 * 3) Scan the 64K of bios
+	 */
+	if (smp_scan_config(0x0,0x400) ||
+		smp_scan_config(639*0x400,0x400) ||
+			smp_scan_config(0xF0000,0x10000))
+		return;
+	/*
+	 * If it is an SMP machine we should know now, unless the
+	 * configuration is in an EISA/MCA bus machine with an
+	 * extended bios data area.
+	 *
+	 * there is a real-mode segmented pointer pointing to the
+	 * 4K EBDA area at 0x40E, calculate and scan it here.
+	 *
+	 * NOTE! There are Linux loaders that will corrupt the EBDA
+	 * area, and as such this kind of SMP config may be less
+	 * trustworthy, simply because the SMP table may have been
+	 * stomped on during early boot. These loaders are buggy and
+	 * should be fixed.
+	 */
+
+	address = *(unsigned short *)phys_to_virt(0x40E);
+	address <<= 4;
+	if (smp_scan_config(address, 0x1000))
+		return;
+
+	/* If we have come this far, we did not find an MP table  */
+	 printk(KERN_INFO "No mptable found.\n");
+}
+
+/*
+ * - Intel MP Configuration Table
+ */
+void __init find_smp_config (void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	find_intel_smp();
+#endif
+}
+
+
+/* --------------------------------------------------------------------------
+                            ACPI-based MP Configuration
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI_BOOT
+
+void __init mp_register_lapic_address (
+	u64			address)
+{
+	mp_lapic_addr = (unsigned long) address;
+
+	set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+
+	if (boot_cpu_id == -1U)
+		boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
+
+	Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
+}
+
+
+void __init mp_register_lapic (
+	u8			id, 
+	u8			enabled)
+{
+	struct mpc_config_processor processor;
+	int			boot_cpu = 0;
+	
+	if (id >= MAX_APICS) {
+		printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
+			id, MAX_APICS);
+		return;
+	}
+
+	if (id == boot_cpu_physical_apicid)
+		boot_cpu = 1;
+
+	processor.mpc_type = MP_PROCESSOR;
+	processor.mpc_apicid = id;
+	processor.mpc_apicver = 0x10; /* TBD: lapic version */
+	processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
+	processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
+	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
+		(boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+	processor.mpc_reserved[0] = 0;
+	processor.mpc_reserved[1] = 0;
+
+	MP_processor_info(&processor);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+#define MP_ISA_BUS		0
+#define MP_MAX_IOAPIC_PIN	127
+
+static struct mp_ioapic_routing {
+	int			apic_id;
+	int			gsi_start;
+	int			gsi_end;
+	u32			pin_programmed[4];
+} mp_ioapic_routing[MAX_IO_APICS];
+
+
+static int mp_find_ioapic (
+	int			gsi)
+{
+	int			i = 0;
+
+	/* Find the IOAPIC that manages this GSI. */
+	for (i = 0; i < nr_ioapics; i++) {
+		if ((gsi >= mp_ioapic_routing[i].gsi_start)
+			&& (gsi <= mp_ioapic_routing[i].gsi_end))
+			return i;
+	}
+
+	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+
+	return -1;
+}
+	
+
+void __init mp_register_ioapic (
+	u8			id, 
+	u32			address,
+	u32			gsi_base)
+{
+	int			idx = 0;
+
+	if (nr_ioapics >= MAX_IO_APICS) {
+		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+			"(found %d)\n", MAX_IO_APICS, nr_ioapics);
+		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+	}
+	if (!address) {
+		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+			" found in MADT table, skipping!\n");
+		return;
+	}
+
+	idx = nr_ioapics++;
+
+	mp_ioapics[idx].mpc_type = MP_IOAPIC;
+	mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+	mp_ioapics[idx].mpc_apicaddr = address;
+
+	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+	mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
+	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+	
+	/* 
+	 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
+	 * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
+	 */
+	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+	mp_ioapic_routing[idx].gsi_start = gsi_base;
+	mp_ioapic_routing[idx].gsi_end = gsi_base + 
+		io_apic_get_redir_entries(idx);
+
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+		"GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
+		mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+		mp_ioapic_routing[idx].gsi_start,
+		mp_ioapic_routing[idx].gsi_end);
+
+	return;
+}
+
+
+void __init mp_override_legacy_irq (
+	u8			bus_irq,
+	u8			polarity, 
+	u8			trigger, 
+	u32			gsi)
+{
+	struct mpc_config_intsrc intsrc;
+	int			ioapic = -1;
+	int			pin = -1;
+
+	/* 
+	 * Convert 'gsi' to 'ioapic.pin'.
+	 */
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return;
+	pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
+
+	/*
+	 * TBD: This check is for faulty timer entries, where the override
+	 *      erroneously sets the trigger to level, resulting in a HUGE 
+	 *      increase of timer interrupts!
+	 */
+	if ((bus_irq == 0) && (trigger == 3))
+		trigger = 1;
+
+	intsrc.mpc_type = MP_INTSRC;
+	intsrc.mpc_irqtype = mp_INT;
+	intsrc.mpc_irqflag = (trigger << 2) | polarity;
+	intsrc.mpc_srcbus = MP_ISA_BUS;
+	intsrc.mpc_srcbusirq = bus_irq;				       /* IRQ */
+	intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;	   /* APIC ID */
+	intsrc.mpc_dstirq = pin;				    /* INTIN# */
+
+	Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", 
+		intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
+		(intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
+		intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
+
+	mp_irqs[mp_irq_entries] = intsrc;
+	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+		panic("Max # of irq sources exceeded!\n");
+
+	return;
+}
+
+
+void __init mp_config_acpi_legacy_irqs (void)
+{
+	struct mpc_config_intsrc intsrc;
+	int			i = 0;
+	int			ioapic = -1;
+
+	/* 
+	 * Fabricate the legacy ISA bus (bus #31).
+	 */
+	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+	/* 
+	 * Locate the IOAPIC that manages the ISA IRQs (0-15). 
+	 */
+	ioapic = mp_find_ioapic(0);
+	if (ioapic < 0)
+		return;
+
+	intsrc.mpc_type = MP_INTSRC;
+	intsrc.mpc_irqflag = 0;					/* Conforming */
+	intsrc.mpc_srcbus = MP_ISA_BUS;
+	intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+
+	/* 
+	 * Use the default configuration for the IRQs 0-15.  Unless
+	 * overridden by (MADT) interrupt source override entries.
+	 */
+	for (i = 0; i < 16; i++) {
+		int idx;
+
+		for (idx = 0; idx < mp_irq_entries; idx++) {
+			struct mpc_config_intsrc *irq = mp_irqs + idx;
+
+			/* Do we already have a mapping for this ISA IRQ? */
+			if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
+				break;
+
+			/* Do we already have a mapping for this IOAPIC pin */
+			if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
+				(irq->mpc_dstirq == i))
+				break;
+		}
+
+		if (idx != mp_irq_entries) {
+			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+			continue;			/* IRQ already used */
+		}
+
+		intsrc.mpc_irqtype = mp_INT;
+		intsrc.mpc_srcbusirq = i;		   /* Identity mapped */
+		intsrc.mpc_dstirq = i;
+
+		Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
+			"%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
+			(intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
+			intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
+			intsrc.mpc_dstirq);
+
+		mp_irqs[mp_irq_entries] = intsrc;
+		if (++mp_irq_entries == MAX_IRQ_SOURCES)
+			panic("Max # of irq sources exceeded!\n");
+	}
+
+	return;
+}
+
+int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
+{
+	int			ioapic = -1;
+	int			ioapic_pin = 0;
+	int			idx, bit = 0;
+
+	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+		return gsi;
+
+#ifdef CONFIG_ACPI_BUS
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (acpi_fadt.sci_int == gsi)
+		return gsi;
+#endif
+
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0) {
+		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+		return gsi;
+	}
+
+	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
+
+	/* 
+	 * Avoid pin reprogramming.  PRTs typically include entries  
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	bit = ioapic_pin % 32;
+	idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
+	if (idx > 3) {
+		printk(KERN_ERR "Invalid reference to IOAPIC pin "
+			"%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
+			ioapic_pin);
+		return gsi;
+	}
+	if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
+		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+		return gsi;
+	}
+
+	mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
+
+	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+		edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1,
+		active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1);
+	return gsi;
+}
+
+#endif /*CONFIG_X86_IO_APIC*/
+#endif /*CONFIG_ACPI_BOOT*/
diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c
new file mode 100644
index 000000000000..598953ab0154
--- /dev/null
+++ b/arch/x86_64/kernel/msr.c
@@ -0,0 +1,279 @@
+/* ----------------------------------------------------------------------- *
+ *   
+ *   Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ *   USA; either version 2 of the License, or (at your option) any later
+ *   version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * msr.c
+ *
+ * x86 MSR access device
+ *
+ * This device is accessed by lseek() to the appropriate register number
+ * and then read/write in chunks of 8 bytes.  A larger size means multiple
+ * reads or writes of the same register.
+ *
+ * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
+ * an SMP box will direct the access to CPU %d.
+ */
+
+#include <linux/module.h>
+#include <linux/config.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+/* Note: "err" is handled in a funny way below.  Otherwise one version
+   of gcc or another breaks. */
+
+static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx)
+{
+	int err;
+
+	asm volatile ("1:	wrmsr\n"
+		      "2:\n"
+		      ".section .fixup,\"ax\"\n"
+		      "3:	movl %4,%0\n"
+		      "	jmp 2b\n"
+		      ".previous\n"
+		      ".section __ex_table,\"a\"\n"
+		      "	.align 8\n" "	.quad 1b,3b\n" ".previous":"=&bDS" (err)
+		      :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0));
+
+	return err;
+}
+
+static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
+{
+	int err;
+
+	asm volatile ("1:	rdmsr\n"
+		      "2:\n"
+		      ".section .fixup,\"ax\"\n"
+		      "3:	movl %4,%0\n"
+		      "	jmp 2b\n"
+		      ".previous\n"
+		      ".section __ex_table,\"a\"\n"
+		      "	.align 8\n"
+		      "	.quad 1b,3b\n"
+		      ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx)
+		      :"c"(reg), "i"(-EIO), "0"(0));
+
+	return err;
+}
+
+#ifdef CONFIG_SMP
+
+struct msr_command {
+	int cpu;
+	int err;
+	u32 reg;
+	u32 data[2];
+};
+
+static void msr_smp_wrmsr(void *cmd_block)
+{
+	struct msr_command *cmd = (struct msr_command *)cmd_block;
+
+	if (cmd->cpu == smp_processor_id())
+		cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
+}
+
+static void msr_smp_rdmsr(void *cmd_block)
+{
+	struct msr_command *cmd = (struct msr_command *)cmd_block;
+
+	if (cmd->cpu == smp_processor_id())
+		cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
+}
+
+static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
+{
+	struct msr_command cmd;
+	int ret;
+
+	preempt_disable();
+	if (cpu == smp_processor_id()) {
+		ret = wrmsr_eio(reg, eax, edx);
+	} else {
+		cmd.cpu = cpu;
+		cmd.reg = reg;
+		cmd.data[0] = eax;
+		cmd.data[1] = edx;
+
+		smp_call_function(msr_smp_wrmsr, &cmd, 1, 1);
+		ret = cmd.err;
+	}
+	preempt_enable();
+	return ret;
+}
+
+static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
+{
+	struct msr_command cmd;
+	int ret;
+
+	preempt_disable();
+	if (cpu == smp_processor_id()) {
+		ret = rdmsr_eio(reg, eax, edx);
+	} else {
+		cmd.cpu = cpu;
+		cmd.reg = reg;
+
+		smp_call_function(msr_smp_rdmsr, &cmd, 1, 1);
+
+		*eax = cmd.data[0];
+		*edx = cmd.data[1];
+
+		ret = cmd.err;
+	}
+	preempt_enable();
+	return ret;
+}
+
+#else				/* ! CONFIG_SMP */
+
+static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
+{
+	return wrmsr_eio(reg, eax, edx);
+}
+
+static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx)
+{
+	return rdmsr_eio(reg, eax, edx);
+}
+
+#endif				/* ! CONFIG_SMP */
+
+static loff_t msr_seek(struct file *file, loff_t offset, int orig)
+{
+	loff_t ret = -EINVAL;
+
+	lock_kernel();
+	switch (orig) {
+	case 0:
+		file->f_pos = offset;
+		ret = file->f_pos;
+		break;
+	case 1:
+		file->f_pos += offset;
+		ret = file->f_pos;
+	}
+	unlock_kernel();
+	return ret;
+}
+
+static ssize_t msr_read(struct file *file, char __user * buf,
+			size_t count, loff_t * ppos)
+{
+	u32 __user *tmp = (u32 __user *) buf;
+	u32 data[2];
+	size_t rv;
+	u32 reg = *ppos;
+	int cpu = iminor(file->f_dentry->d_inode);
+	int err;
+
+	if (count % 8)
+		return -EINVAL;	/* Invalid chunk size */
+
+	for (rv = 0; count; count -= 8) {
+		err = do_rdmsr(cpu, reg, &data[0], &data[1]);
+		if (err)
+			return err;
+		if (copy_to_user(tmp, &data, 8))
+			return -EFAULT;
+		tmp += 2;
+	}
+
+	return ((char __user *)tmp) - buf;
+}
+
+static ssize_t msr_write(struct file *file, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	const u32 __user *tmp = (const u32 __user *)buf;
+	u32 data[2];
+	size_t rv;
+	u32 reg = *ppos;
+	int cpu = iminor(file->f_dentry->d_inode);
+	int err;
+
+	if (count % 8)
+		return -EINVAL;	/* Invalid chunk size */
+
+	for (rv = 0; count; count -= 8) {
+		if (copy_from_user(&data, tmp, 8))
+			return -EFAULT;
+		err = do_wrmsr(cpu, reg, data[0], data[1]);
+		if (err)
+			return err;
+		tmp += 2;
+	}
+
+	return ((char __user *)tmp) - buf;
+}
+
+static int msr_open(struct inode *inode, struct file *file)
+{
+	unsigned int cpu = iminor(file->f_dentry->d_inode);
+	struct cpuinfo_x86 *c = &(cpu_data)[cpu];
+
+	if (cpu >= NR_CPUS || !cpu_online(cpu))
+		return -ENXIO;	/* No such CPU */
+	if (!cpu_has(c, X86_FEATURE_MSR))
+		return -EIO;	/* MSR not supported */
+
+	return 0;
+}
+
+/*
+ * File operations we support
+ */
+static struct file_operations msr_fops = {
+	.owner = THIS_MODULE,
+	.llseek = msr_seek,
+	.read = msr_read,
+	.write = msr_write,
+	.open = msr_open,
+};
+
+static int __init msr_init(void)
+{
+	if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
+		printk(KERN_ERR "msr: unable to get major %d for msr\n",
+		       MSR_MAJOR);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void __exit msr_exit(void)
+{
+	unregister_chrdev(MSR_MAJOR, "cpu/msr");
+}
+
+module_init(msr_init);
+module_exit(msr_exit)
+
+MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
+MODULE_DESCRIPTION("x86 generic MSR driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
new file mode 100644
index 000000000000..d9867de6a626
--- /dev/null
+++ b/arch/x86_64/kernel/nmi.c
@@ -0,0 +1,488 @@
+/*
+ *  linux/arch/x86_64/nmi.c
+ *
+ *  NMI watchdog support on APIC systems
+ *
+ *  Started by Ingo Molnar <mingo@redhat.com>
+ *
+ *  Fixes:
+ *  Mikael Pettersson	: AMD K7 support for local APIC NMI watchdog.
+ *  Mikael Pettersson	: Power Management for local APIC NMI watchdog.
+ *  Pavel Machek and
+ *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/nmi.h>
+#include <linux/sysctl.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/nmi.h>
+#include <asm/msr.h>
+#include <asm/proto.h>
+#include <asm/kdebug.h>
+
+/*
+ * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
+ * - it may be reserved by some other driver, or not
+ * - when not reserved by some other driver, it may be used for
+ *   the NMI watchdog, or not
+ *
+ * This is maintained separately from nmi_active because the NMI
+ * watchdog may also be driven from the I/O APIC timer.
+ */
+static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
+static unsigned int lapic_nmi_owner;
+#define LAPIC_NMI_WATCHDOG	(1<<0)
+#define LAPIC_NMI_RESERVED	(1<<1)
+
+/* nmi_active:
+ * +1: the lapic NMI watchdog is active, but can be disabled
+ *  0: the lapic NMI watchdog has not been set up, and cannot
+ *     be enabled
+ * -1: the lapic NMI watchdog is disabled, but can be enabled
+ */
+int nmi_active;		/* oprofile uses this */
+int panic_on_timeout;
+
+unsigned int nmi_watchdog = NMI_DEFAULT;
+static unsigned int nmi_hz = HZ;
+unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
+
+/* Note that these events don't tick when the CPU idles. This means
+   the frequency varies with CPU load. */
+
+#define K7_EVNTSEL_ENABLE	(1 << 22)
+#define K7_EVNTSEL_INT		(1 << 20)
+#define K7_EVNTSEL_OS		(1 << 17)
+#define K7_EVNTSEL_USR		(1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
+#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+#define P6_EVNTSEL0_ENABLE	(1 << 22)
+#define P6_EVNTSEL_INT		(1 << 20)
+#define P6_EVNTSEL_OS		(1 << 17)
+#define P6_EVNTSEL_USR		(1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
+#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+/* Run after command line and cpu_init init, but before all other checks */
+void __init nmi_watchdog_default(void)
+{
+	if (nmi_watchdog != NMI_DEFAULT)
+		return;
+
+	/* For some reason the IO APIC watchdog doesn't work on the AMD
+	   8111 chipset. For now switch to local APIC mode using
+	   perfctr0 there.  On Intel CPUs we don't have code to handle
+	   the perfctr and the IO-APIC seems to work, so use that.  */
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		nmi_watchdog = NMI_LOCAL_APIC; 
+		printk(KERN_INFO 
+              "Using local APIC NMI watchdog using perfctr0\n");
+	} else {
+		printk(KERN_INFO "Using IO APIC NMI watchdog\n");
+		nmi_watchdog = NMI_IO_APIC;
+	}
+}
+
+/* Why is there no CPUID flag for this? */
+static __init int cpu_has_lapic(void)
+{
+	switch (boot_cpu_data.x86_vendor) { 
+	case X86_VENDOR_INTEL:
+	case X86_VENDOR_AMD: 
+		return boot_cpu_data.x86 >= 6; 
+	/* .... add more cpus here or find a different way to figure this out. */	
+	default:
+		return 0;
+	} 	
+}
+
+int __init check_nmi_watchdog (void)
+{
+	int counts[NR_CPUS];
+	int cpu;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic())  {
+		nmi_watchdog = NMI_NONE;
+		return -1; 
+	}	
+
+	printk(KERN_INFO "testing NMI watchdog ... ");
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		counts[cpu] = cpu_pda[cpu].__nmi_count; 
+	local_irq_enable();
+	mdelay((10*1000)/nmi_hz); // wait 10 ticks
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_SMP
+		/* Check cpu_callin_map here because that is set
+		   after the timer is started. */
+		if (!cpu_isset(cpu, cpu_callin_map))
+			continue;
+#endif
+		if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
+			printk("CPU#%d: NMI appears to be stuck (%d)!\n", 
+			       cpu,
+			       cpu_pda[cpu].__nmi_count);
+			nmi_active = 0;
+			lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
+			return -1;
+		}
+	}
+	printk("OK.\n");
+
+	/* now that we know it works we can reduce NMI frequency to
+	   something more reasonable; makes a difference in some configs */
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		nmi_hz = 1;
+
+	return 0;
+}
+
+int __init setup_nmi_watchdog(char *str)
+{
+	int nmi;
+
+	if (!strncmp(str,"panic",5)) {
+		panic_on_timeout = 1;
+		str = strchr(str, ',');
+		if (!str)
+			return 1;
+		++str;
+	}
+
+	get_option(&str, &nmi);
+
+	if (nmi >= NMI_INVALID)
+		return 0;
+		nmi_watchdog = nmi;
+	return 1;
+}
+
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+static void disable_lapic_nmi_watchdog(void)
+{
+	if (nmi_active <= 0)
+		return;
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		wrmsr(MSR_K7_EVNTSEL0, 0, 0);
+		break;
+	case X86_VENDOR_INTEL:
+		wrmsr(MSR_IA32_EVNTSEL0, 0, 0);
+		break;
+	}
+	nmi_active = -1;
+	/* tell do_nmi() and others that we're not active any more */
+	nmi_watchdog = 0;
+}
+
+static void enable_lapic_nmi_watchdog(void)
+{
+	if (nmi_active < 0) {
+		nmi_watchdog = NMI_LOCAL_APIC;
+		setup_apic_nmi_watchdog();
+	}
+}
+
+int reserve_lapic_nmi(void)
+{
+	unsigned int old_owner;
+
+	spin_lock(&lapic_nmi_owner_lock);
+	old_owner = lapic_nmi_owner;
+	lapic_nmi_owner |= LAPIC_NMI_RESERVED;
+	spin_unlock(&lapic_nmi_owner_lock);
+	if (old_owner & LAPIC_NMI_RESERVED)
+		return -EBUSY;
+	if (old_owner & LAPIC_NMI_WATCHDOG)
+		disable_lapic_nmi_watchdog();
+	return 0;
+}
+
+void release_lapic_nmi(void)
+{
+	unsigned int new_owner;
+
+	spin_lock(&lapic_nmi_owner_lock);
+	new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
+	lapic_nmi_owner = new_owner;
+	spin_unlock(&lapic_nmi_owner_lock);
+	if (new_owner & LAPIC_NMI_WATCHDOG)
+		enable_lapic_nmi_watchdog();
+}
+
+void disable_timer_nmi_watchdog(void)
+{
+	if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
+		return;
+
+	disable_irq(0);
+	unset_nmi_callback();
+	nmi_active = -1;
+	nmi_watchdog = NMI_NONE;
+}
+
+void enable_timer_nmi_watchdog(void)
+{
+	if (nmi_active < 0) {
+		nmi_watchdog = NMI_IO_APIC;
+		touch_nmi_watchdog();
+		nmi_active = 1;
+		enable_irq(0);
+	}
+}
+
+#ifdef CONFIG_PM
+
+static int nmi_pm_active; /* nmi_active before suspend */
+
+static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
+{
+	nmi_pm_active = nmi_active;
+	disable_lapic_nmi_watchdog();
+	return 0;
+}
+
+static int lapic_nmi_resume(struct sys_device *dev)
+{
+	if (nmi_pm_active > 0)
+	enable_lapic_nmi_watchdog();
+	return 0;
+}
+
+static struct sysdev_class nmi_sysclass = {
+	set_kset_name("lapic_nmi"),
+	.resume		= lapic_nmi_resume,
+	.suspend	= lapic_nmi_suspend,
+};
+
+static struct sys_device device_lapic_nmi = {
+	.id		= 0,
+	.cls	= &nmi_sysclass,
+};
+
+static int __init init_lapic_nmi_sysfs(void)
+{
+	int error;
+
+	if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
+		return 0;
+
+	error = sysdev_class_register(&nmi_sysclass);
+	if (!error)
+		error = sysdev_register(&device_lapic_nmi);
+	return error;
+}
+/* must come after the local APIC's device_initcall() */
+late_initcall(init_lapic_nmi_sysfs);
+
+#endif	/* CONFIG_PM */
+
+/*
+ * Activate the NMI watchdog via the local APIC.
+ * Original code written by Keith Owens.
+ */
+
+static void setup_k7_watchdog(void)
+{
+	int i;
+	unsigned int evntsel;
+
+	/* No check, so can start with slow frequency */
+	nmi_hz = 1; 
+
+	/* XXX should check these in EFER */
+
+	nmi_perfctr_msr = MSR_K7_PERFCTR0;
+
+	for(i = 0; i < 4; ++i) {
+		/* Simulator may not support it */
+		if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL))
+			return;
+		wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
+	}
+
+	evntsel = K7_EVNTSEL_INT
+		| K7_EVNTSEL_OS
+		| K7_EVNTSEL_USR
+		| K7_NMI_EVENT;
+
+	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+	wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= K7_EVNTSEL_ENABLE;
+	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+}
+
+void setup_apic_nmi_watchdog(void)
+{
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (boot_cpu_data.x86 < 6)
+			return;
+		if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
+			return;
+		setup_k7_watchdog();
+		break;
+	default:
+		return;
+	}
+	lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
+	nmi_active = 1;
+}
+
+/*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check it's local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up any console locks first ...
+ * [when there will be more tty-related locks, break them up
+ *  here too!]
+ */
+
+static unsigned int
+	last_irq_sums [NR_CPUS],
+	alert_counter [NR_CPUS];
+
+void touch_nmi_watchdog (void)
+{
+	int i;
+
+	/*
+	 * Just reset the alert counters, (other CPUs might be
+	 * spinning on locks we hold):
+	 */
+	for (i = 0; i < NR_CPUS; i++)
+		alert_counter[i] = 0;
+}
+
+void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
+{
+	int sum, cpu;
+
+	cpu = safe_smp_processor_id();
+	sum = read_pda(apic_timer_irqs);
+	if (last_irq_sums[cpu] == sum) {
+		/*
+		 * Ayiee, looks like this CPU is stuck ...
+		 * wait a few IRQs (5 seconds) before doing the oops ...
+		 */
+		alert_counter[cpu]++;
+		if (alert_counter[cpu] == 5*nmi_hz) {
+			if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+							== NOTIFY_STOP) {
+				alert_counter[cpu] = 0; 
+				return;
+			} 
+			die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
+		}
+	} else {
+		last_irq_sums[cpu] = sum;
+		alert_counter[cpu] = 0;
+	}
+	if (nmi_perfctr_msr)
+		wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
+}
+
+static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
+{
+	return 0;
+}
+ 
+static nmi_callback_t nmi_callback = dummy_nmi_callback;
+ 
+asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
+{
+	int cpu = safe_smp_processor_id();
+
+	nmi_enter();
+	add_pda(__nmi_count,1);
+	if (!nmi_callback(regs, cpu))
+		default_do_nmi(regs);
+	nmi_exit();
+}
+
+void set_nmi_callback(nmi_callback_t callback)
+{
+	nmi_callback = callback;
+}
+
+void unset_nmi_callback(void)
+{
+	nmi_callback = dummy_nmi_callback;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
+{
+	unsigned char reason = get_nmi_reason();
+	char buf[64];
+
+	if (!(reason & 0xc0)) {
+		sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+		die_nmi(buf,regs);
+	}
+	return 0;
+}
+
+/*
+ * proc handler for /proc/sys/kernel/unknown_nmi_panic
+ */
+int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int old_state;
+
+	old_state = unknown_nmi_panic;
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (!!old_state == !!unknown_nmi_panic)
+		return 0;
+
+	if (unknown_nmi_panic) {
+		if (reserve_lapic_nmi() < 0) {
+			unknown_nmi_panic = 0;
+			return -EBUSY;
+		} else {
+			set_nmi_callback(unknown_nmi_panic_callback);
+		}
+	} else {
+		release_lapic_nmi();
+		unset_nmi_callback();
+	}
+	return 0;
+}
+
+#endif
+
+EXPORT_SYMBOL(nmi_active);
+EXPORT_SYMBOL(nmi_watchdog);
+EXPORT_SYMBOL(reserve_lapic_nmi);
+EXPORT_SYMBOL(release_lapic_nmi);
+EXPORT_SYMBOL(disable_timer_nmi_watchdog);
+EXPORT_SYMBOL(enable_timer_nmi_watchdog);
+EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
new file mode 100644
index 000000000000..cab471cf3edb
--- /dev/null
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -0,0 +1,60 @@
+/*
+ * Dynamic DMA mapping support.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <asm/io.h>
+
+/* Map a set of buffers described by scatterlist in streaming
+ * mode for DMA.  This is the scatter-gather version of the
+ * above pci_map_single interface.  Here the scatter gather list
+ * elements are each tagged with the appropriate dma address
+ * and length.  They are obtained via sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for pci_map_single are
+ * the same here.
+ */
+int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
+	       int nents, int direction)
+{
+	int i;
+
+	BUG_ON(direction == DMA_NONE);
+ 	for (i = 0; i < nents; i++ ) {
+		struct scatterlist *s = &sg[i];
+		BUG_ON(!s->page); 
+		s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
+		s->dma_length = s->length;
+	}
+	return nents;
+}
+
+EXPORT_SYMBOL(dma_map_sg);
+
+/* Unmap a set of streaming mode DMA translations.
+ * Again, cpu read rules concerning calls here are the same as for
+ * pci_unmap_single() above.
+ */
+void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+		  int nents, int dir)
+{
+	int i;
+	for (i = 0; i < nents; i++) { 
+		struct scatterlist *s = &sg[i];
+		BUG_ON(s->page == NULL); 
+		BUG_ON(s->dma_address == 0); 
+		dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
+	} 
+}
+
+EXPORT_SYMBOL(dma_unmap_sg);
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
new file mode 100644
index 000000000000..57f35c68aa34
--- /dev/null
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -0,0 +1,980 @@
+/*
+ * Dynamic DMA mapping support for AMD Hammer.
+ * 
+ * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
+ * This allows to use PCI devices that only support 32bit addresses on systems
+ * with more than 4GB. 
+ *
+ * See Documentation/DMA-mapping.txt for the interface specification.
+ * 
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/agp_backend.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <asm/atomic.h>
+#include <asm/io.h>
+#include <asm/mtrr.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/cacheflush.h>
+#include <asm/kdebug.h>
+
+dma_addr_t bad_dma_address;
+
+unsigned long iommu_bus_base;	/* GART remapping area (physical) */
+static unsigned long iommu_size; 	/* size of remapping area bytes */
+static unsigned long iommu_pages;	/* .. and in pages */
+
+u32 *iommu_gatt_base; 		/* Remapping table */
+
+int no_iommu; 
+static int no_agp; 
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow = 1; 
+int force_iommu = 1;
+#else
+int panic_on_overflow = 0;
+int force_iommu = 0;
+#endif
+int iommu_merge = 1;
+int iommu_sac_force = 0; 
+
+/* If this is disabled the IOMMU will use an optimized flushing strategy
+   of only flushing when an mapping is reused. With it true the GART is flushed 
+   for every mapping. Problem is that doing the lazy flush seems to trigger
+   bugs with some popular PCI cards, in particular 3ware (but has been also
+   also seen with Qlogic at least). */
+int iommu_fullflush = 1;
+
+/* This tells the BIO block layer to assume merging. Default to off
+   because we cannot guarantee merging later. */
+int iommu_bio_merge = 0;
+
+#define MAX_NB 8
+
+/* Allocation bitmap for the remapping area */ 
+static DEFINE_SPINLOCK(iommu_bitmap_lock);
+static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
+
+static u32 gart_unmapped_entry; 
+
+#define GPTE_VALID    1
+#define GPTE_COHERENT 2
+#define GPTE_ENCODE(x) \
+	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
+#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
+
+#define to_pages(addr,size) \
+	(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+#define for_all_nb(dev) \
+	dev = NULL;	\
+	while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\
+	     if (dev->bus->number == 0 && 				     \
+		    (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31))
+
+static struct pci_dev *northbridges[MAX_NB];
+static u32 northbridge_flush_word[MAX_NB];
+
+#define EMERGENCY_PAGES 32 /* = 128KB */ 
+
+#ifdef CONFIG_AGP
+#define AGPEXTERN extern
+#else
+#define AGPEXTERN
+#endif
+
+/* backdoor interface to AGP driver */
+AGPEXTERN int agp_memory_reserved;
+AGPEXTERN __u32 *agp_gatt_table;
+
+static unsigned long next_bit;  /* protected by iommu_bitmap_lock */
+static int need_flush; 		/* global flush state. set for each gart wrap */
+static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
+			       size_t size, int dir, int do_panic);
+
+/* Dummy device used for NULL arguments (normally ISA). Better would
+   be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */
+static struct device fallback_dev = {
+	.bus_id = "fallback device",
+	.coherent_dma_mask = 0xffffffff,
+	.dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+static unsigned long alloc_iommu(int size) 
+{ 	
+	unsigned long offset, flags;
+
+	spin_lock_irqsave(&iommu_bitmap_lock, flags);	
+	offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
+	if (offset == -1) {
+		need_flush = 1;
+	       	offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size);
+	}
+	if (offset != -1) { 
+		set_bit_string(iommu_gart_bitmap, offset, size); 
+		next_bit = offset+size; 
+		if (next_bit >= iommu_pages) { 
+			next_bit = 0;
+			need_flush = 1;
+		} 
+	} 
+	if (iommu_fullflush)
+		need_flush = 1;
+	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);      
+	return offset;
+} 
+
+static void free_iommu(unsigned long offset, int size)
+{ 
+	unsigned long flags;
+	if (size == 1) { 
+		clear_bit(offset, iommu_gart_bitmap); 
+		return;
+	}
+	spin_lock_irqsave(&iommu_bitmap_lock, flags);
+	__clear_bit_string(iommu_gart_bitmap, offset, size);
+	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+} 
+
+/* 
+ * Use global flush state to avoid races with multiple flushers.
+ */
+static void flush_gart(struct device *dev)
+{ 
+	unsigned long flags;
+	int flushed = 0;
+	int i, max;
+
+	spin_lock_irqsave(&iommu_bitmap_lock, flags);
+	if (need_flush) { 
+		max = 0;
+		for (i = 0; i < MAX_NB; i++) {
+			if (!northbridges[i]) 
+				continue;
+			pci_write_config_dword(northbridges[i], 0x9c, 
+					       northbridge_flush_word[i] | 1); 
+			flushed++;
+			max = i;
+		}
+		for (i = 0; i <= max; i++) {
+			u32 w;
+			if (!northbridges[i])
+				continue;
+			/* Make sure the hardware actually executed the flush. */
+			do { 
+				pci_read_config_dword(northbridges[i], 0x9c, &w);
+			} while (w & 1);
+		} 
+		if (!flushed) 
+			printk("nothing to flush?\n");
+		need_flush = 0;
+	} 
+	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+} 
+
+/* Allocate DMA memory on node near device */
+noinline
+static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
+{
+	struct page *page;
+	int node;
+	if (dev->bus == &pci_bus_type) {
+		cpumask_t mask;
+		mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
+		node = cpu_to_node(first_cpu(mask));
+	} else
+		node = numa_node_id();
+	page = alloc_pages_node(node, gfp, order);
+	return page ? page_address(page) : NULL;
+}
+
+/* 
+ * Allocate memory for a coherent mapping.
+ */
+void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		   unsigned gfp)
+{
+	void *memory;
+	unsigned long dma_mask = 0;
+	u64 bus;
+
+	if (!dev)
+		dev = &fallback_dev;
+	dma_mask = dev->coherent_dma_mask;
+	if (dma_mask == 0) 
+		dma_mask = 0xffffffff; 
+
+	/* Kludge to make it bug-to-bug compatible with i386. i386
+	   uses the normal dma_mask for alloc_coherent. */
+	dma_mask &= *dev->dma_mask;
+
+ again:
+	memory = dma_alloc_pages(dev, gfp, get_order(size));
+	if (memory == NULL)
+		return NULL;
+
+	{
+		int high, mmu;
+		bus = virt_to_bus(memory);
+	        high = (bus + size) >= dma_mask;
+		mmu = high;
+		if (force_iommu && !(gfp & GFP_DMA)) 
+			mmu = 1;
+		if (no_iommu || dma_mask < 0xffffffffUL) { 
+			if (high) {
+				free_pages((unsigned long)memory,
+					   get_order(size));
+
+				if (swiotlb) {
+					return
+					swiotlb_alloc_coherent(dev, size,
+							       dma_handle,
+							       gfp);
+				}
+
+				if (!(gfp & GFP_DMA)) { 
+					gfp |= GFP_DMA; 
+					goto again;
+				}
+				return NULL;
+			}
+			mmu = 0; 
+		} 	
+		memset(memory, 0, size); 
+		if (!mmu) { 
+			*dma_handle = virt_to_bus(memory);
+			return memory;
+		}
+	} 
+
+	*dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0);
+	if (*dma_handle == bad_dma_address)
+		goto error; 
+	flush_gart(dev);
+	return memory; 
+	
+error:
+	if (panic_on_overflow)
+		panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size);
+	free_pages((unsigned long)memory, get_order(size)); 
+	return NULL; 
+}
+
+/* 
+ * Unmap coherent memory.
+ * The caller must ensure that the device has finished accessing the mapping.
+ */
+void dma_free_coherent(struct device *dev, size_t size,
+			 void *vaddr, dma_addr_t bus)
+{
+	if (swiotlb) {
+		swiotlb_free_coherent(dev, size, vaddr, bus);
+		return;
+	}
+
+	dma_unmap_single(dev, bus, size, 0);
+	free_pages((unsigned long)vaddr, get_order(size)); 		
+}
+
+#ifdef CONFIG_IOMMU_LEAK
+
+#define SET_LEAK(x) if (iommu_leak_tab) \
+			iommu_leak_tab[x] = __builtin_return_address(0);
+#define CLEAR_LEAK(x) if (iommu_leak_tab) \
+			iommu_leak_tab[x] = NULL;
+
+/* Debugging aid for drivers that don't free their IOMMU tables */
+static void **iommu_leak_tab; 
+static int leak_trace;
+int iommu_leak_pages = 20; 
+void dump_leak(void)
+{
+	int i;
+	static int dump; 
+	if (dump || !iommu_leak_tab) return;
+	dump = 1;
+	show_stack(NULL,NULL);
+	/* Very crude. dump some from the end of the table too */ 
+	printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); 
+	for (i = 0; i < iommu_leak_pages; i+=2) {
+		printk("%lu: ", iommu_pages-i);
+		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
+		printk("%c", (i+1)%2 == 0 ? '\n' : ' '); 
+	} 
+	printk("\n");
+}
+#else
+#define SET_LEAK(x)
+#define CLEAR_LEAK(x)
+#endif
+
+static void iommu_full(struct device *dev, size_t size, int dir, int do_panic)
+{
+	/* 
+	 * Ran out of IOMMU space for this operation. This is very bad.
+	 * Unfortunately the drivers cannot handle this operation properly.
+	 * Return some non mapped prereserved space in the aperture and 
+	 * let the Northbridge deal with it. This will result in garbage
+	 * in the IO operation. When the size exceeds the prereserved space
+	 * memory corruption will occur or random memory will be DMAed 
+	 * out. Hopefully no network devices use single mappings that big.
+	 */ 
+	
+	printk(KERN_ERR 
+  "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
+	       size, dev->bus_id);
+
+	if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) {
+		if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+			panic("PCI-DMA: Memory would be corrupted\n");
+		if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 
+			panic("PCI-DMA: Random memory would be DMAed\n");
+	} 
+
+#ifdef CONFIG_IOMMU_LEAK
+	dump_leak(); 
+#endif
+} 
+
+static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
+{ 
+	u64 mask = *dev->dma_mask;
+	int high = addr + size >= mask;
+	int mmu = high;
+	if (force_iommu) 
+		mmu = 1; 
+	if (no_iommu) { 
+		if (high) 
+			panic("PCI-DMA: high address but no IOMMU.\n"); 
+		mmu = 0; 
+	} 	
+	return mmu; 
+}
+
+static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
+{ 
+	u64 mask = *dev->dma_mask;
+	int high = addr + size >= mask;
+	int mmu = high;
+	if (no_iommu) { 
+		if (high) 
+			panic("PCI-DMA: high address but no IOMMU.\n"); 
+		mmu = 0; 
+	} 	
+	return mmu; 
+}
+
+/* Map a single continuous physical area into the IOMMU.
+ * Caller needs to check if the iommu is needed and flush.
+ */
+static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
+				size_t size, int dir, int do_panic)
+{ 
+	unsigned long npages = to_pages(phys_mem, size);
+	unsigned long iommu_page = alloc_iommu(npages);
+	int i;
+	if (iommu_page == -1) {
+		if (!nonforced_iommu(dev, phys_mem, size))
+			return phys_mem; 
+		if (panic_on_overflow)
+			panic("dma_map_area overflow %lu bytes\n", size);
+		iommu_full(dev, size, dir, do_panic);
+		return bad_dma_address;
+	}
+
+	for (i = 0; i < npages; i++) {
+		iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
+		SET_LEAK(iommu_page + i);
+		phys_mem += PAGE_SIZE;
+	}
+	return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
+}
+
+/* Map a single area into the IOMMU */
+dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir)
+{
+	unsigned long phys_mem, bus;
+
+	BUG_ON(dir == DMA_NONE);
+
+	if (swiotlb)
+		return swiotlb_map_single(dev,addr,size,dir);
+	if (!dev)
+		dev = &fallback_dev;
+
+	phys_mem = virt_to_phys(addr); 
+	if (!need_iommu(dev, phys_mem, size))
+		return phys_mem; 
+
+	bus = dma_map_area(dev, phys_mem, size, dir, 1);
+	flush_gart(dev); 
+	return bus; 
+} 
+
+/* Fallback for dma_map_sg in case of overflow */
+static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+			       int nents, int dir)
+{
+	int i;
+
+#ifdef CONFIG_IOMMU_DEBUG
+	printk(KERN_DEBUG "dma_map_sg overflow\n");
+#endif
+
+ 	for (i = 0; i < nents; i++ ) {
+		struct scatterlist *s = &sg[i];
+		unsigned long addr = page_to_phys(s->page) + s->offset; 
+		if (nonforced_iommu(dev, addr, s->length)) { 
+			addr = dma_map_area(dev, addr, s->length, dir, 0);
+			if (addr == bad_dma_address) { 
+				if (i > 0) 
+					dma_unmap_sg(dev, sg, i, dir);
+				nents = 0; 
+				sg[0].dma_length = 0;
+				break;
+			}
+		}
+		s->dma_address = addr;
+		s->dma_length = s->length;
+	}
+	flush_gart(dev);
+	return nents;
+}
+
+/* Map multiple scatterlist entries continuous into the first. */
+static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
+		      struct scatterlist *sout, unsigned long pages)
+{
+	unsigned long iommu_start = alloc_iommu(pages);
+	unsigned long iommu_page = iommu_start; 
+	int i;
+
+	if (iommu_start == -1)
+		return -1;
+	
+	for (i = start; i < stopat; i++) {
+		struct scatterlist *s = &sg[i];
+		unsigned long pages, addr;
+		unsigned long phys_addr = s->dma_address;
+		
+		BUG_ON(i > start && s->offset);
+		if (i == start) {
+			*sout = *s; 
+			sout->dma_address = iommu_bus_base;
+			sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
+			sout->dma_length = s->length;
+		} else { 
+			sout->dma_length += s->length; 
+		}
+
+		addr = phys_addr;
+		pages = to_pages(s->offset, s->length); 
+		while (pages--) { 
+			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 
+			SET_LEAK(iommu_page);
+			addr += PAGE_SIZE;
+			iommu_page++;
+	} 
+	} 
+	BUG_ON(iommu_page - iommu_start != pages);	
+	return 0;
+}
+
+static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
+		      struct scatterlist *sout,
+		      unsigned long pages, int need)
+{
+	if (!need) { 
+		BUG_ON(stopat - start != 1);
+		*sout = sg[start]; 
+		sout->dma_length = sg[start].length; 
+		return 0;
+	} 
+	return __dma_map_cont(sg, start, stopat, sout, pages);
+}
+		
+/*
+ * DMA map all entries in a scatterlist.
+ * Merge chunks that have page aligned sizes into a continuous mapping. 
+ */
+int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+{
+	int i;
+	int out;
+	int start;
+	unsigned long pages = 0;
+	int need = 0, nextneed;
+
+	BUG_ON(dir == DMA_NONE);
+	if (nents == 0) 
+		return 0;
+
+	if (swiotlb)
+		return swiotlb_map_sg(dev,sg,nents,dir);
+	if (!dev)
+		dev = &fallback_dev;
+
+	out = 0;
+	start = 0;
+	for (i = 0; i < nents; i++) {
+		struct scatterlist *s = &sg[i];
+		dma_addr_t addr = page_to_phys(s->page) + s->offset;
+		s->dma_address = addr;
+		BUG_ON(s->length == 0); 
+
+		nextneed = need_iommu(dev, addr, s->length); 
+
+		/* Handle the previous not yet processed entries */
+		if (i > start) {
+			struct scatterlist *ps = &sg[i-1];
+			/* Can only merge when the last chunk ends on a page 
+			   boundary and the new one doesn't have an offset. */
+			if (!iommu_merge || !nextneed || !need || s->offset ||
+			    (ps->offset + ps->length) % PAGE_SIZE) { 
+				if (dma_map_cont(sg, start, i, sg+out, pages,
+						 need) < 0)
+					goto error;
+				out++;
+				pages = 0;
+				start = i;	
+			}
+		}
+
+		need = nextneed;
+		pages += to_pages(s->offset, s->length);
+	}
+	if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
+		goto error;
+	out++;
+	flush_gart(dev);
+	if (out < nents) 
+		sg[out].dma_length = 0; 
+	return out;
+
+error:
+	flush_gart(NULL);
+	dma_unmap_sg(dev, sg, nents, dir);
+	/* When it was forced try again unforced */
+	if (force_iommu) 
+		return dma_map_sg_nonforce(dev, sg, nents, dir);
+	if (panic_on_overflow)
+		panic("dma_map_sg: overflow on %lu pages\n", pages);
+	iommu_full(dev, pages << PAGE_SHIFT, dir, 0);
+	for (i = 0; i < nents; i++)
+		sg[i].dma_address = bad_dma_address;
+	return 0;
+} 
+
+/*
+ * Free a DMA mapping.
+ */ 
+void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
+		      size_t size, int direction)
+{
+	unsigned long iommu_page; 
+	int npages;
+	int i;
+
+	if (swiotlb) {
+		swiotlb_unmap_single(dev,dma_addr,size,direction);
+		return;
+	}
+
+	if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || 
+	    dma_addr >= iommu_bus_base + iommu_size)
+		return;
+	iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;	
+	npages = to_pages(dma_addr, size);
+	for (i = 0; i < npages; i++) { 
+		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 
+		CLEAR_LEAK(iommu_page + i);
+	}
+	free_iommu(iommu_page, npages);
+}
+
+/* 
+ * Wrapper for pci_unmap_single working with scatterlists.
+ */ 
+void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+{
+	int i;
+	if (swiotlb) {
+		swiotlb_unmap_sg(dev,sg,nents,dir);
+		return;
+	}
+	for (i = 0; i < nents; i++) { 
+		struct scatterlist *s = &sg[i];
+		if (!s->dma_length || !s->length) 
+			break;
+		dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
+	}
+}
+
+int dma_supported(struct device *dev, u64 mask)
+{
+	/* Copied from i386. Doesn't make much sense, because it will 
+	   only work for pci_alloc_coherent.
+	   The caller just has to use GFP_DMA in this case. */
+        if (mask < 0x00ffffff)
+                return 0;
+
+	/* Tell the device to use SAC when IOMMU force is on. 
+	   This allows the driver to use cheaper accesses in some cases.
+
+	   Problem with this is that if we overflow the IOMMU area
+	   and return DAC as fallback address the device may not handle it correctly.
+	   
+	   As a special case some controllers have a 39bit address mode 
+	   that is as efficient as 32bit (aic79xx). Don't force SAC for these.
+	   Assume all masks <= 40 bits are of this type. Normally this doesn't
+	   make any difference, but gives more gentle handling of IOMMU overflow. */
+	if (iommu_sac_force && (mask >= 0xffffffffffULL)) { 
+		printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
+		return 0; 
+	}
+
+	return 1;
+} 
+
+int dma_get_cache_alignment(void)
+{
+	return boot_cpu_data.x86_clflush_size;
+}
+
+EXPORT_SYMBOL(dma_unmap_sg);
+EXPORT_SYMBOL(dma_map_sg);
+EXPORT_SYMBOL(dma_map_single);
+EXPORT_SYMBOL(dma_unmap_single);
+EXPORT_SYMBOL(dma_supported);
+EXPORT_SYMBOL(no_iommu);
+EXPORT_SYMBOL(force_iommu); 
+EXPORT_SYMBOL(bad_dma_address);
+EXPORT_SYMBOL(iommu_bio_merge);
+EXPORT_SYMBOL(iommu_sac_force);
+EXPORT_SYMBOL(dma_get_cache_alignment);
+EXPORT_SYMBOL(dma_alloc_coherent);
+EXPORT_SYMBOL(dma_free_coherent);
+
+static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+{ 
+	unsigned long a; 
+	if (!iommu_size) { 
+		iommu_size = aper_size; 
+		if (!no_agp) 
+			iommu_size /= 2; 
+	} 
+
+	a = aper + iommu_size; 
+	iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
+
+	if (iommu_size < 64*1024*1024) 
+		printk(KERN_WARNING
+  "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); 
+	
+	return iommu_size;
+} 
+
+static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) 
+{ 
+	unsigned aper_size = 0, aper_base_32;
+	u64 aper_base;
+	unsigned aper_order;
+
+	pci_read_config_dword(dev, 0x94, &aper_base_32); 
+	pci_read_config_dword(dev, 0x90, &aper_order);
+	aper_order = (aper_order >> 1) & 7;	
+
+	aper_base = aper_base_32 & 0x7fff; 
+	aper_base <<= 25;
+
+	aper_size = (32 * 1024 * 1024) << aper_order; 
+	if (aper_base + aper_size >= 0xffffffff || !aper_size)
+		aper_base = 0;
+
+	*size = aper_size;
+	return aper_base;
+} 
+
+/* 
+ * Private Northbridge GATT initialization in case we cannot use the
+ * AGP driver for some reason.  
+ */
+static __init int init_k8_gatt(struct agp_kern_info *info)
+{ 
+	struct pci_dev *dev;
+	void *gatt;
+	unsigned aper_base, new_aper_base;
+	unsigned aper_size, gatt_size, new_aper_size;
+	
+	printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
+	aper_size = aper_base = info->aper_size = 0;
+	for_all_nb(dev) { 
+		new_aper_base = read_aperture(dev, &new_aper_size); 
+		if (!new_aper_base) 
+			goto nommu; 
+		
+		if (!aper_base) { 
+			aper_size = new_aper_size;
+			aper_base = new_aper_base;
+		} 
+		if (aper_size != new_aper_size || aper_base != new_aper_base) 
+			goto nommu;
+	}
+	if (!aper_base)
+		goto nommu; 
+	info->aper_base = aper_base;
+	info->aper_size = aper_size>>20; 
+
+	gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 
+	gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 
+	if (!gatt) 
+		panic("Cannot allocate GATT table"); 
+	memset(gatt, 0, gatt_size); 
+	agp_gatt_table = gatt;
+	
+	for_all_nb(dev) { 
+		u32 ctl; 
+		u32 gatt_reg; 
+
+		gatt_reg = __pa(gatt) >> 12; 
+		gatt_reg <<= 4; 
+		pci_write_config_dword(dev, 0x98, gatt_reg);
+		pci_read_config_dword(dev, 0x90, &ctl); 
+
+		ctl |= 1;
+		ctl &= ~((1<<4) | (1<<5));
+
+		pci_write_config_dword(dev, 0x90, ctl); 
+	}
+	flush_gart(NULL); 
+	
+	printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 
+	return 0;
+
+ nommu:
+ 	/* Should not happen anymore */
+	printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
+	       KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); 
+	return -1; 
+} 
+
+extern int agp_amd64_init(void);
+
+static int __init pci_iommu_init(void)
+{ 
+	struct agp_kern_info info;
+	unsigned long aper_size;
+	unsigned long iommu_start;
+	struct pci_dev *dev;
+	unsigned long scratch;
+	long i;
+
+#ifndef CONFIG_AGP_AMD64
+	no_agp = 1; 
+#else
+	/* Makefile puts PCI initialization via subsys_initcall first. */
+	/* Add other K8 AGP bridge drivers here */
+	no_agp = no_agp || 
+		(agp_amd64_init() < 0) || 
+		(agp_copy_info(agp_bridge, &info) < 0);
+#endif	
+
+	if (swiotlb) { 
+		no_iommu = 1;
+		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+		return -1; 
+	} 
+	
+	if (no_iommu ||
+	    (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) ||
+	    !iommu_aperture ||
+	    (no_agp && init_k8_gatt(&info) < 0)) {
+		printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); 
+		no_iommu = 1;
+		return -1;
+	}
+
+	aper_size = info.aper_size * 1024 * 1024;	
+	iommu_size = check_iommu_size(info.aper_base, aper_size); 
+	iommu_pages = iommu_size >> PAGE_SHIFT; 
+
+	iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, 
+						    get_order(iommu_pages/8)); 
+	if (!iommu_gart_bitmap) 
+		panic("Cannot allocate iommu bitmap\n"); 
+	memset(iommu_gart_bitmap, 0, iommu_pages/8);
+
+#ifdef CONFIG_IOMMU_LEAK
+	if (leak_trace) { 
+		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 
+				  get_order(iommu_pages*sizeof(void *)));
+		if (iommu_leak_tab) 
+			memset(iommu_leak_tab, 0, iommu_pages * 8); 
+		else
+			printk("PCI-DMA: Cannot allocate leak trace area\n"); 
+	} 
+#endif
+
+	/* 
+	 * Out of IOMMU space handling.
+	 * Reserve some invalid pages at the beginning of the GART. 
+	 */ 
+	set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 
+
+	agp_memory_reserved = iommu_size;	
+	printk(KERN_INFO
+	       "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+	       iommu_size>>20); 
+
+	iommu_start = aper_size - iommu_size;	
+	iommu_bus_base = info.aper_base + iommu_start; 
+	bad_dma_address = iommu_bus_base;
+	iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+	/* 
+	 * Unmap the IOMMU part of the GART. The alias of the page is
+	 * always mapped with cache enabled and there is no full cache
+	 * coherency across the GART remapping. The unmapping avoids
+	 * automatic prefetches from the CPU allocating cache lines in
+	 * there. All CPU accesses are done via the direct mapping to
+	 * the backing memory. The GART address is only used by PCI
+	 * devices. 
+	 */
+	clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
+
+	/* 
+	 * Try to workaround a bug (thanks to BenH) 
+	 * Set unmapped entries to a scratch page instead of 0. 
+	 * Any prefetches that hit unmapped entries won't get an bus abort
+	 * then.
+	 */
+	scratch = get_zeroed_page(GFP_KERNEL); 
+	if (!scratch) 
+		panic("Cannot allocate iommu scratch page");
+	gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
+	for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 
+		iommu_gatt_base[i] = gart_unmapped_entry;
+
+	for_all_nb(dev) {
+		u32 flag; 
+		int cpu = PCI_SLOT(dev->devfn) - 24;
+		if (cpu >= MAX_NB)
+			continue;
+		northbridges[cpu] = dev;
+		pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */
+		northbridge_flush_word[cpu] = flag; 
+	}
+		     
+	flush_gart(NULL);
+
+	return 0;
+} 
+
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
+
+/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
+         [,forcesac][,fullflush][,nomerge][,biomerge]
+   size  set size of iommu (in bytes) 
+   noagp don't initialize the AGP driver and use full aperture.
+   off   don't use the IOMMU
+   leak  turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
+   memaper[=order] allocate an own aperture over RAM with size 32MB^order.  
+   noforce don't force IOMMU usage. Default.
+   force  Force IOMMU.
+   merge  Do lazy merging. This may improve performance on some block devices.
+          Implies force (experimental)
+   biomerge Do merging at the BIO layer. This is more efficient than merge,
+            but should be only done with very big IOMMUs. Implies merge,force.
+   nomerge Don't do SG merging.
+   forcesac For SAC mode for masks <40bits  (experimental)
+   fullflush Flush IOMMU on each allocation (default) 
+   nofullflush Don't use IOMMU fullflush
+   allowed  overwrite iommu off workarounds for specific chipsets.
+   soft	 Use software bounce buffering (default for Intel machines)
+   noaperture Don't touch the aperture for AGP.
+*/
+__init int iommu_setup(char *p)
+{ 
+    int arg;
+
+    while (*p) {
+	    if (!strncmp(p,"noagp",5))
+		    no_agp = 1;
+	    if (!strncmp(p,"off",3))
+		    no_iommu = 1;
+	    if (!strncmp(p,"force",5)) {
+		    force_iommu = 1;
+		    iommu_aperture_allowed = 1;
+	    }
+	    if (!strncmp(p,"allowed",7))
+		    iommu_aperture_allowed = 1;
+	    if (!strncmp(p,"noforce",7)) {
+		    iommu_merge = 0;
+		    force_iommu = 0;
+	    }
+	    if (!strncmp(p, "memaper", 7)) {
+		    fallback_aper_force = 1; 
+		    p += 7; 
+		    if (*p == '=') {
+			    ++p;
+			    if (get_option(&p, &arg))
+				    fallback_aper_order = arg;
+		    }
+	    } 
+	    if (!strncmp(p, "biomerge",8)) {
+		    iommu_bio_merge = 4096;
+		    iommu_merge = 1;
+		    force_iommu = 1;
+	    }
+	    if (!strncmp(p, "panic",5))
+		    panic_on_overflow = 1;
+	    if (!strncmp(p, "nopanic",7))
+		    panic_on_overflow = 0;	    
+	    if (!strncmp(p, "merge",5)) {
+		    iommu_merge = 1;
+		    force_iommu = 1; 
+	    }
+	    if (!strncmp(p, "nomerge",7))
+		    iommu_merge = 0;
+	    if (!strncmp(p, "forcesac",8))
+		    iommu_sac_force = 1;
+	    if (!strncmp(p, "fullflush",8))
+		    iommu_fullflush = 1;
+	    if (!strncmp(p, "nofullflush",11))
+		    iommu_fullflush = 0;
+	    if (!strncmp(p, "soft",4))
+		    swiotlb = 1;
+	    if (!strncmp(p, "noaperture",10))
+		    fix_aperture = 0;
+#ifdef CONFIG_IOMMU_LEAK
+	    if (!strncmp(p,"leak",4)) {
+		    leak_trace = 1;
+		    p += 4; 
+		    if (*p == '=') ++p;
+		    if (isdigit(*p) && get_option(&p, &arg))
+			    iommu_leak_pages = arg;
+	    } else
+#endif
+	    if (isdigit(*p) && get_option(&p, &arg)) 
+		    iommu_size = arg;
+	    p += strcspn(p, ",");
+	    if (*p == ',')
+		    ++p;
+    }
+    return 1;
+} 
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
new file mode 100644
index 000000000000..67d90b89af0b
--- /dev/null
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -0,0 +1,94 @@
+/* Fallback functions when the main IOMMU code is not compiled in. This
+   code is roughly equivalent to i386. */
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <asm/proto.h>
+#include <asm/processor.h>
+
+int iommu_merge = 0;
+EXPORT_SYMBOL(iommu_merge);
+
+dma_addr_t bad_dma_address;
+EXPORT_SYMBOL(bad_dma_address);
+
+int iommu_bio_merge = 0;
+EXPORT_SYMBOL(iommu_bio_merge);
+
+int iommu_sac_force = 0;
+EXPORT_SYMBOL(iommu_sac_force);
+
+/* 
+ * Dummy IO MMU functions
+ */
+
+void *dma_alloc_coherent(struct device *hwdev, size_t size,
+			 dma_addr_t *dma_handle, unsigned gfp)
+{
+	void *ret;
+	u64 mask;
+	int order = get_order(size);
+
+	if (hwdev)
+		mask = hwdev->coherent_dma_mask & *hwdev->dma_mask;
+	else
+		mask = 0xffffffff;
+	for (;;) {
+		ret = (void *)__get_free_pages(gfp, order);
+		if (ret == NULL)
+			return NULL;
+		*dma_handle = virt_to_bus(ret);
+		if ((*dma_handle & ~mask) == 0)
+			break;
+		free_pages((unsigned long)ret, order);
+		if (gfp & GFP_DMA)
+			return NULL;
+		gfp |= GFP_DMA;
+	}
+
+	memset(ret, 0, size);
+	return ret;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+void dma_free_coherent(struct device *hwdev, size_t size,
+			 void *vaddr, dma_addr_t dma_handle)
+{
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+int dma_supported(struct device *hwdev, u64 mask)
+{
+        /*
+         * we fall back to GFP_DMA when the mask isn't all 1s,
+         * so we can't guarantee allocations that must be
+         * within a tighter range than GFP_DMA..
+	 * RED-PEN this won't work for pci_map_single. Caller has to
+	 * use GFP_DMA in the first place.
+         */
+        if (mask < 0x00ffffff)
+                return 0;
+
+	return 1;
+} 
+EXPORT_SYMBOL(dma_supported);
+
+int dma_get_cache_alignment(void)
+{
+	return boot_cpu_data.x86_clflush_size;
+}
+EXPORT_SYMBOL(dma_get_cache_alignment);
+
+static int __init check_ram(void) 
+{ 
+	if (end_pfn >= 0xffffffff>>PAGE_SHIFT) { 
+		printk(
+		KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n"
+		KERN_ERR "WARNING 32bit PCI may malfunction.\n");
+	} 
+	return 0;
+} 
+__initcall(check_ram);
+
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
new file mode 100644
index 000000000000..9922d2ba24a3
--- /dev/null
+++ b/arch/x86_64/kernel/process.c
@@ -0,0 +1,770 @@
+/*
+ *  linux/arch/x86-64/kernel/process.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * 
+ *  X86-64 port
+ *	Andi Kleen.
+ * 
+ *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#include <stdarg.h>
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/module.h>
+#include <linux/a.out.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/ptrace.h>
+#include <linux/utsname.h>
+#include <linux/random.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/mmu_context.h>
+#include <asm/pda.h>
+#include <asm/prctl.h>
+#include <asm/kdebug.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/ia32.h>
+
+asmlinkage extern void ret_from_fork(void);
+
+unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
+
+static atomic_t hlt_counter = ATOMIC_INIT(0);
+
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
+
+void disable_hlt(void)
+{
+	atomic_inc(&hlt_counter);
+}
+
+EXPORT_SYMBOL(disable_hlt);
+
+void enable_hlt(void)
+{
+	atomic_dec(&hlt_counter);
+}
+
+EXPORT_SYMBOL(enable_hlt);
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void default_idle(void)
+{
+	if (!atomic_read(&hlt_counter)) {
+		local_irq_disable();
+		if (!need_resched())
+			safe_halt();
+		else
+			local_irq_enable();
+	}
+}
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle (void)
+{
+	int oldval;
+
+	local_irq_enable();
+
+	/*
+	 * Deal with another CPU just having chosen a thread to
+	 * run here:
+	 */
+	oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
+
+	if (!oldval) {
+		set_thread_flag(TIF_POLLING_NRFLAG); 
+		asm volatile(
+			"2:"
+			"testl %0,%1;"
+			"rep; nop;"
+			"je 2b;"
+			: :
+			"i" (_TIF_NEED_RESCHED), 
+			"m" (current_thread_info()->flags));
+	} else {
+		set_need_resched();
+	}
+}
+
+void cpu_idle_wait(void)
+{
+	unsigned int cpu, this_cpu = get_cpu();
+	cpumask_t map;
+
+	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+	put_cpu();
+
+	cpus_clear(map);
+	for_each_online_cpu(cpu) {
+		per_cpu(cpu_idle_state, cpu) = 1;
+		cpu_set(cpu, map);
+	}
+
+	__get_cpu_var(cpu_idle_state) = 0;
+
+	wmb();
+	do {
+		ssleep(1);
+		for_each_online_cpu(cpu) {
+			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+				cpu_clear(cpu, map);
+		}
+		cpus_and(map, map, cpu_online_map);
+	} while (!cpus_empty(map));
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle (void)
+{
+	/* endless idle loop with no priority at all */
+	while (1) {
+		while (!need_resched()) {
+			void (*idle)(void);
+
+			if (__get_cpu_var(cpu_idle_state))
+				__get_cpu_var(cpu_idle_state) = 0;
+
+			rmb();
+			idle = pm_idle;
+			if (!idle)
+				idle = default_idle;
+			idle();
+		}
+
+		schedule();
+	}
+}
+
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ */
+static void mwait_idle(void)
+{
+	local_irq_enable();
+
+	if (!need_resched()) {
+		set_thread_flag(TIF_POLLING_NRFLAG);
+		do {
+			__monitor((void *)&current_thread_info()->flags, 0, 0);
+			if (need_resched())
+				break;
+			__mwait(0, 0);
+		} while (!need_resched());
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+	}
+}
+
+void __init select_idle_routine(const struct cpuinfo_x86 *c)
+{
+	static int printed;
+	if (cpu_has(c, X86_FEATURE_MWAIT)) {
+		/*
+		 * Skip, if setup has overridden idle.
+		 * One CPU supports mwait => All CPUs supports mwait
+		 */
+		if (!pm_idle) {
+			if (!printed) {
+				printk("using mwait in idle threads.\n");
+				printed = 1;
+			}
+			pm_idle = mwait_idle;
+		}
+	}
+}
+
+static int __init idle_setup (char *str)
+{
+	if (!strncmp(str, "poll", 4)) {
+		printk("using polling idle threads.\n");
+		pm_idle = poll_idle;
+	}
+
+	boot_option_idle_override = 1;
+	return 1;
+}
+
+__setup("idle=", idle_setup);
+
+/* Prints also some state that isn't saved in the pt_regs */ 
+void __show_regs(struct pt_regs * regs)
+{
+	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+	unsigned int fsindex,gsindex;
+	unsigned int ds,cs,es; 
+
+	printk("\n");
+	print_modules();
+	printk("Pid: %d, comm: %.20s %s %s\n", 
+	       current->pid, current->comm, print_tainted(), system_utsname.release);
+	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
+	printk_address(regs->rip); 
+	printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+	       regs->rax, regs->rbx, regs->rcx);
+	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+	       regs->rdx, regs->rsi, regs->rdi); 
+	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+	       regs->rbp, regs->r8, regs->r9); 
+	printk("R10: %016lx R11: %016lx R12: %016lx\n",
+	       regs->r10, regs->r11, regs->r12); 
+	printk("R13: %016lx R14: %016lx R15: %016lx\n",
+	       regs->r13, regs->r14, regs->r15); 
+
+	asm("movl %%ds,%0" : "=r" (ds)); 
+	asm("movl %%cs,%0" : "=r" (cs)); 
+	asm("movl %%es,%0" : "=r" (es)); 
+	asm("movl %%fs,%0" : "=r" (fsindex));
+	asm("movl %%gs,%0" : "=r" (gsindex));
+
+	rdmsrl(MSR_FS_BASE, fs);
+	rdmsrl(MSR_GS_BASE, gs); 
+	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 
+
+	asm("movq %%cr0, %0": "=r" (cr0));
+	asm("movq %%cr2, %0": "=r" (cr2));
+	asm("movq %%cr3, %0": "=r" (cr3));
+	asm("movq %%cr4, %0": "=r" (cr4));
+
+	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
+	       fs,fsindex,gs,gsindex,shadowgs); 
+	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 
+	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+}
+
+void show_regs(struct pt_regs *regs)
+{
+	__show_regs(regs);
+	show_trace(&regs->rsp);
+}
+
+/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+	struct task_struct *me = current;
+	struct thread_struct *t = &me->thread;
+	if (me->thread.io_bitmap_ptr) { 
+		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+
+		kfree(t->io_bitmap_ptr);
+		t->io_bitmap_ptr = NULL;
+		/*
+		 * Careful, clear this in the TSS too:
+		 */
+		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
+		t->io_bitmap_max = 0;
+		put_cpu();
+	}
+}
+
+void flush_thread(void)
+{
+	struct task_struct *tsk = current;
+	struct thread_info *t = current_thread_info();
+
+	if (t->flags & _TIF_ABI_PENDING)
+		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
+
+	tsk->thread.debugreg0 = 0;
+	tsk->thread.debugreg1 = 0;
+	tsk->thread.debugreg2 = 0;
+	tsk->thread.debugreg3 = 0;
+	tsk->thread.debugreg6 = 0;
+	tsk->thread.debugreg7 = 0;
+	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));	
+	/*
+	 * Forget coprocessor state..
+	 */
+	clear_fpu(tsk);
+	clear_used_math();
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+	if (dead_task->mm) {
+		if (dead_task->mm->context.size) {
+			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
+					dead_task->comm,
+					dead_task->mm->context.ldt,
+					dead_task->mm->context.size);
+			BUG();
+		}
+	}
+}
+
+static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
+{
+	struct user_desc ud = { 
+		.base_addr = addr,
+		.limit = 0xfffff,
+		.seg_32bit = 1,
+		.limit_in_pages = 1,
+		.useable = 1,
+	};
+	struct n_desc_struct *desc = (void *)t->thread.tls_array;
+	desc += tls;
+	desc->a = LDT_entry_a(&ud); 
+	desc->b = LDT_entry_b(&ud); 
+}
+
+static inline u32 read_32bit_tls(struct task_struct *t, int tls)
+{
+	struct desc_struct *desc = (void *)t->thread.tls_array;
+	desc += tls;
+	return desc->base0 | 
+		(((u32)desc->base1) << 16) | 
+		(((u32)desc->base2) << 24);
+}
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+	unlazy_fpu(tsk);
+}
+
+int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 
+		unsigned long unused,
+	struct task_struct * p, struct pt_regs * regs)
+{
+	int err;
+	struct pt_regs * childregs;
+	struct task_struct *me = current;
+
+	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
+
+	*childregs = *regs;
+
+	childregs->rax = 0;
+	childregs->rsp = rsp;
+	if (rsp == ~0UL) {
+		childregs->rsp = (unsigned long)childregs;
+	}
+
+	p->thread.rsp = (unsigned long) childregs;
+	p->thread.rsp0 = (unsigned long) (childregs+1);
+	p->thread.userrsp = me->thread.userrsp; 
+
+	set_ti_thread_flag(p->thread_info, TIF_FORK);
+
+	p->thread.fs = me->thread.fs;
+	p->thread.gs = me->thread.gs;
+
+	asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
+	asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
+	asm("movl %%es,%0" : "=m" (p->thread.es));
+	asm("movl %%ds,%0" : "=m" (p->thread.ds));
+
+	if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 
+		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+		if (!p->thread.io_bitmap_ptr) {
+			p->thread.io_bitmap_max = 0;
+			return -ENOMEM;
+		}
+		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
+	} 
+
+	/*
+	 * Set a new TLS for the child thread?
+	 */
+	if (clone_flags & CLONE_SETTLS) {
+#ifdef CONFIG_IA32_EMULATION
+		if (test_thread_flag(TIF_IA32))
+			err = ia32_child_tls(p, childregs); 
+		else 			
+#endif	 
+			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
+		if (err) 
+			goto out;
+	}
+	err = 0;
+out:
+	if (err && p->thread.io_bitmap_ptr) {
+		kfree(p->thread.io_bitmap_ptr);
+		p->thread.io_bitmap_max = 0;
+	}
+	return err;
+}
+
+/*
+ * This special macro can be used to load a debugging register
+ */
+#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
+
+/*
+ *	switch_to(x,y) should switch tasks from x to y.
+ *
+ * This could still be optimized: 
+ * - fold all the options into a flag word and test it with a single test.
+ * - could test fs/gs bitsliced
+ */
+struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+	struct thread_struct *prev = &prev_p->thread,
+				 *next = &next_p->thread;
+	int cpu = smp_processor_id();  
+	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+	unlazy_fpu(prev_p);
+
+	/*
+	 * Reload esp0, LDT and the page table pointer:
+	 */
+	tss->rsp0 = next->rsp0;
+
+	/* 
+	 * Switch DS and ES.
+	 * This won't pick up thread selector changes, but I guess that is ok.
+	 */
+	asm volatile("movl %%es,%0" : "=m" (prev->es)); 
+	if (unlikely(next->es | prev->es))
+		loadsegment(es, next->es); 
+	
+	asm volatile ("movl %%ds,%0" : "=m" (prev->ds)); 
+	if (unlikely(next->ds | prev->ds))
+		loadsegment(ds, next->ds);
+
+	load_TLS(next, cpu);
+
+	/* 
+	 * Switch FS and GS.
+	 */
+	{ 
+		unsigned fsindex;
+		asm volatile("movl %%fs,%0" : "=r" (fsindex)); 
+		/* segment register != 0 always requires a reload. 
+		   also reload when it has changed. 
+		   when prev process used 64bit base always reload
+		   to avoid an information leak. */
+		if (unlikely(fsindex | next->fsindex | prev->fs)) {
+			loadsegment(fs, next->fsindex);
+			/* check if the user used a selector != 0
+	                 * if yes clear 64bit base, since overloaded base
+                         * is always mapped to the Null selector
+                         */
+			if (fsindex)
+			prev->fs = 0;				
+		}
+		/* when next process has a 64bit base use it */
+		if (next->fs) 
+			wrmsrl(MSR_FS_BASE, next->fs); 
+		prev->fsindex = fsindex;
+	}
+	{ 
+		unsigned gsindex;
+		asm volatile("movl %%gs,%0" : "=r" (gsindex)); 
+		if (unlikely(gsindex | next->gsindex | prev->gs)) {
+			load_gs_index(next->gsindex);
+			if (gsindex)
+			prev->gs = 0;				
+		}
+		if (next->gs)
+			wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
+		prev->gsindex = gsindex;
+	}
+
+	/* 
+	 * Switch the PDA context.
+	 */
+	prev->userrsp = read_pda(oldrsp); 
+	write_pda(oldrsp, next->userrsp); 
+	write_pda(pcurrent, next_p); 
+	write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
+
+	/*
+	 * Now maybe reload the debug registers
+	 */
+	if (unlikely(next->debugreg7)) {
+		loaddebug(next, 0);
+		loaddebug(next, 1);
+		loaddebug(next, 2);
+		loaddebug(next, 3);
+		/* no 4 and 5 */
+		loaddebug(next, 6);
+		loaddebug(next, 7);
+	}
+
+
+	/* 
+	 * Handle the IO bitmap 
+	 */ 
+	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+		if (next->io_bitmap_ptr)
+			/*
+			 * Copy the relevant range of the IO bitmap.
+			 * Normally this is 128 bytes or less:
+ 			 */
+			memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+				max(prev->io_bitmap_max, next->io_bitmap_max));
+		else {
+			/*
+			 * Clear any possible leftover bits:
+			 */
+			memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+		}
+	}
+
+	return prev_p;
+}
+
+/*
+ * sys_execve() executes a new program.
+ */
+asmlinkage 
+long sys_execve(char __user *name, char __user * __user *argv,
+		char __user * __user *envp, struct pt_regs regs)
+{
+	long error;
+	char * filename;
+
+	filename = getname(name);
+	error = PTR_ERR(filename);
+	if (IS_ERR(filename)) 
+		return error;
+	error = do_execve(filename, argv, envp, &regs); 
+	if (error == 0) {
+		task_lock(current);
+		current->ptrace &= ~PT_DTRACE;
+		task_unlock(current);
+	}
+	putname(filename);
+	return error;
+}
+
+void set_personality_64bit(void)
+{
+	/* inherit personality from parent */
+
+	/* Make sure to be in 64bit mode */
+	clear_thread_flag(TIF_IA32); 
+
+	/* TBD: overwrites user setup. Should have two bits.
+	   But 64bit processes have always behaved this way,
+	   so it's not too bad. The main problem is just that
+   	   32bit childs are affected again. */
+	current->personality &= ~READ_IMPLIES_EXEC;
+}
+
+asmlinkage long sys_fork(struct pt_regs *regs)
+{
+	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
+}
+
+asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+{
+	if (!newsp)
+		newsp = regs->rsp;
+	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+asmlinkage long sys_vfork(struct pt_regs *regs)
+{
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
+		    NULL, NULL);
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+	unsigned long stack;
+	u64 fp,rip;
+	int count = 0;
+
+	if (!p || p == current || p->state==TASK_RUNNING)
+		return 0; 
+	stack = (unsigned long)p->thread_info; 
+	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
+		return 0;
+	fp = *(u64 *)(p->thread.rsp);
+	do { 
+		if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+			return 0; 
+		rip = *(u64 *)(fp+8); 
+		if (!in_sched_functions(rip))
+			return rip; 
+		fp = *(u64 *)fp; 
+	} while (count++ < 16); 
+	return 0;
+}
+
+long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+{ 
+	int ret = 0; 
+	int doit = task == current;
+	int cpu;
+
+	switch (code) { 
+	case ARCH_SET_GS:
+		if (addr >= TASK_SIZE) 
+			return -EPERM; 
+		cpu = get_cpu();
+		/* handle small bases via the GDT because that's faster to 
+		   switch. */
+		if (addr <= 0xffffffff) {  
+			set_32bit_tls(task, GS_TLS, addr); 
+			if (doit) { 
+				load_TLS(&task->thread, cpu);
+				load_gs_index(GS_TLS_SEL); 
+			}
+			task->thread.gsindex = GS_TLS_SEL; 
+			task->thread.gs = 0;
+		} else { 
+			task->thread.gsindex = 0;
+			task->thread.gs = addr;
+			if (doit) {
+		load_gs_index(0);
+		ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 
+			} 
+		}
+		put_cpu();
+		break;
+	case ARCH_SET_FS:
+		/* Not strictly needed for fs, but do it for symmetry
+		   with gs */
+		if (addr >= TASK_SIZE)
+			return -EPERM; 
+		cpu = get_cpu();
+		/* handle small bases via the GDT because that's faster to 
+		   switch. */
+		if (addr <= 0xffffffff) { 
+			set_32bit_tls(task, FS_TLS, addr);
+			if (doit) { 
+				load_TLS(&task->thread, cpu); 
+				asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
+			}
+			task->thread.fsindex = FS_TLS_SEL;
+			task->thread.fs = 0;
+		} else { 
+			task->thread.fsindex = 0;
+			task->thread.fs = addr;
+			if (doit) {
+				/* set the selector to 0 to not confuse
+				   __switch_to */
+		asm volatile("movl %0,%%fs" :: "r" (0));
+		ret = checking_wrmsrl(MSR_FS_BASE, addr); 
+			}
+		}
+		put_cpu();
+		break;
+	case ARCH_GET_FS: { 
+		unsigned long base; 
+		if (task->thread.fsindex == FS_TLS_SEL)
+			base = read_32bit_tls(task, FS_TLS);
+		else if (doit) {
+			rdmsrl(MSR_FS_BASE, base);
+		} else
+			base = task->thread.fs;
+		ret = put_user(base, (unsigned long __user *)addr); 
+		break; 
+	}
+	case ARCH_GET_GS: { 
+		unsigned long base;
+		if (task->thread.gsindex == GS_TLS_SEL)
+			base = read_32bit_tls(task, GS_TLS);
+		else if (doit) {
+			rdmsrl(MSR_KERNEL_GS_BASE, base);
+		} else
+			base = task->thread.gs;
+		ret = put_user(base, (unsigned long __user *)addr); 
+		break;
+	}
+
+	default:
+		ret = -EINVAL;
+		break;
+	} 
+
+	return ret;	
+} 
+
+long sys_arch_prctl(int code, unsigned long addr)
+{
+	return do_arch_prctl(current, code, addr);
+} 
+
+/* 
+ * Capture the user space registers if the task is not running (in user space)
+ */
+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
+{
+	struct pt_regs *pp, ptregs;
+
+	pp = (struct pt_regs *)(tsk->thread.rsp0);
+	--pp; 
+
+	ptregs = *pp; 
+	ptregs.cs &= 0xffff;
+	ptregs.ss &= 0xffff;
+
+	elf_core_copy_regs(regs, &ptregs);
+ 
+	return 1;
+}
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+	if (randomize_va_space)
+		sp -= get_random_int() % 8192;
+	return sp & ~0xf;
+}
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
new file mode 100644
index 000000000000..0b7b101debdf
--- /dev/null
+++ b/arch/x86_64/kernel/ptrace.c
@@ -0,0 +1,547 @@
+/* ptrace.c */
+/* By Ross Biro 1/23/92 */
+/*
+ * Pentium III FXSR, SSE support
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * 
+ * x86-64 port 2000-2002 Andi Kleen
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/security.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/ia32.h>
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/* determines which flags the user has access to. */
+/* 1 = access 0 = no access */
+#define FLAG_MASK 0x44dd5UL
+
+/* set's the trap flag. */
+#define TRAP_FLAG 0x100UL
+
+/*
+ * eflags and offset of eflags on child stack..
+ */
+#define EFLAGS offsetof(struct pt_regs, eflags)
+#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
+
+/*
+ * this routine will get a word off of the processes privileged stack. 
+ * the offset is how far from the base addr as stored in the TSS.  
+ * this routine assumes that all the privileged stacks are in our
+ * data space.
+ */   
+static inline unsigned long get_stack_long(struct task_struct *task, int offset)
+{
+	unsigned char *stack;
+
+	stack = (unsigned char *)task->thread.rsp0;
+	stack += offset;
+	return (*((unsigned long *)stack));
+}
+
+/*
+ * this routine will put a word on the processes privileged stack. 
+ * the offset is how far from the base addr as stored in the TSS.  
+ * this routine assumes that all the privileged stacks are in our
+ * data space.
+ */
+static inline long put_stack_long(struct task_struct *task, int offset,
+	unsigned long data)
+{
+	unsigned char * stack;
+
+	stack = (unsigned char *) task->thread.rsp0;
+	stack += offset;
+	*(unsigned long *) stack = data;
+	return 0;
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching..
+ *
+ * Make sure the single step bit is not set.
+ */
+void ptrace_disable(struct task_struct *child)
+{ 
+	long tmp;
+
+	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+	tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
+	put_stack_long(child, EFL_OFFSET, tmp);
+}
+
+static int putreg(struct task_struct *child,
+	unsigned long regno, unsigned long value)
+{
+	unsigned long tmp; 
+	
+	/* Some code in the 64bit emulation may not be 64bit clean.
+	   Don't take any chances. */
+	if (test_tsk_thread_flag(child, TIF_IA32))
+		value &= 0xffffffff;
+	switch (regno) {
+		case offsetof(struct user_regs_struct,fs):
+			if (value && (value & 3) != 3)
+				return -EIO;
+			child->thread.fsindex = value & 0xffff; 
+			return 0;
+		case offsetof(struct user_regs_struct,gs):
+			if (value && (value & 3) != 3)
+				return -EIO;
+			child->thread.gsindex = value & 0xffff;
+			return 0;
+		case offsetof(struct user_regs_struct,ds):
+			if (value && (value & 3) != 3)
+				return -EIO;
+			child->thread.ds = value & 0xffff;
+			return 0;
+		case offsetof(struct user_regs_struct,es): 
+			if (value && (value & 3) != 3)
+				return -EIO;
+			child->thread.es = value & 0xffff;
+			return 0;
+		case offsetof(struct user_regs_struct,ss):
+			if ((value & 3) != 3)
+				return -EIO;
+			value &= 0xffff;
+			return 0;
+		case offsetof(struct user_regs_struct,fs_base):
+			if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
+				return -EIO; 
+			child->thread.fs = value;
+			return 0;
+		case offsetof(struct user_regs_struct,gs_base):
+			if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
+				return -EIO; 
+			child->thread.gs = value;
+			return 0;
+		case offsetof(struct user_regs_struct, eflags):
+			value &= FLAG_MASK;
+			tmp = get_stack_long(child, EFL_OFFSET); 
+			tmp &= ~FLAG_MASK; 
+			value |= tmp;
+			break;
+		case offsetof(struct user_regs_struct,cs): 
+			if ((value & 3) != 3)
+				return -EIO;
+			value &= 0xffff;
+			break;
+	}
+	put_stack_long(child, regno - sizeof(struct pt_regs), value);
+	return 0;
+}
+
+static unsigned long getreg(struct task_struct *child, unsigned long regno)
+{
+	unsigned long val;
+	switch (regno) {
+		case offsetof(struct user_regs_struct, fs):
+			return child->thread.fsindex;
+		case offsetof(struct user_regs_struct, gs):
+			return child->thread.gsindex;
+		case offsetof(struct user_regs_struct, ds):
+			return child->thread.ds;
+		case offsetof(struct user_regs_struct, es):
+			return child->thread.es; 
+		case offsetof(struct user_regs_struct, fs_base):
+			return child->thread.fs;
+		case offsetof(struct user_regs_struct, gs_base):
+			return child->thread.gs;
+		default:
+			regno = regno - sizeof(struct pt_regs);
+			val = get_stack_long(child, regno);
+			if (test_tsk_thread_flag(child, TIF_IA32))
+				val &= 0xffffffff;
+			return val;
+	}
+
+}
+
+asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data)
+{
+	struct task_struct *child;
+	long i, ret;
+	unsigned ui;
+
+	/* This lock_kernel fixes a subtle race with suid exec */
+	lock_kernel();
+	ret = -EPERM;
+	if (request == PTRACE_TRACEME) {
+		/* are we already being traced? */
+		if (current->ptrace & PT_PTRACED)
+			goto out;
+		ret = security_ptrace(current->parent, current);
+		if (ret)
+			goto out;
+		/* set the ptrace bit in the process flags. */
+		current->ptrace |= PT_PTRACED;
+		ret = 0;
+		goto out;
+	}
+	ret = -ESRCH;
+	read_lock(&tasklist_lock);
+	child = find_task_by_pid(pid);
+	if (child)
+		get_task_struct(child);
+	read_unlock(&tasklist_lock);
+	if (!child)
+		goto out;
+
+	ret = -EPERM;
+	if (pid == 1)		/* you may not mess with init */
+		goto out_tsk;
+
+	if (request == PTRACE_ATTACH) {
+		ret = ptrace_attach(child);
+		goto out_tsk;
+	}
+	ret = ptrace_check_attach(child, request == PTRACE_KILL); 
+	if (ret < 0) 
+		goto out_tsk;
+
+	switch (request) {
+	/* when I and D space are separate, these will need to be fixed. */
+	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
+	case PTRACE_PEEKDATA: {
+		unsigned long tmp;
+		int copied;
+
+		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+		ret = -EIO;
+		if (copied != sizeof(tmp))
+			break;
+		ret = put_user(tmp,(unsigned long __user *) data);
+		break;
+	}
+
+	/* read the word at location addr in the USER area. */
+	case PTRACE_PEEKUSR: {
+		unsigned long tmp;
+
+		ret = -EIO;
+		if ((addr & 7) ||
+		    addr > sizeof(struct user) - 7)
+			break;
+
+		switch (addr) { 
+		case 0 ... sizeof(struct user_regs_struct):
+			tmp = getreg(child, addr);
+			break;
+		case offsetof(struct user, u_debugreg[0]):
+			tmp = child->thread.debugreg0;
+			break;
+		case offsetof(struct user, u_debugreg[1]):
+			tmp = child->thread.debugreg1;
+			break;
+		case offsetof(struct user, u_debugreg[2]):
+			tmp = child->thread.debugreg2;
+			break;
+		case offsetof(struct user, u_debugreg[3]):
+			tmp = child->thread.debugreg3;
+			break;
+		case offsetof(struct user, u_debugreg[6]):
+			tmp = child->thread.debugreg6;
+			break;
+		case offsetof(struct user, u_debugreg[7]):
+			tmp = child->thread.debugreg7;
+			break;
+		default:
+			tmp = 0;
+			break;
+		}
+		ret = put_user(tmp,(unsigned long __user *) data);
+		break;
+	}
+
+	/* when I and D space are separate, this will have to be fixed. */
+	case PTRACE_POKETEXT: /* write the word at location addr. */
+	case PTRACE_POKEDATA:
+		ret = 0;
+		if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
+			break;
+		ret = -EIO;
+		break;
+
+	case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
+		ret = -EIO;
+		if ((addr & 7) ||
+		    addr > sizeof(struct user) - 7)
+			break;
+
+		switch (addr) { 
+		case 0 ... sizeof(struct user_regs_struct): 
+			ret = putreg(child, addr, data);
+			break;
+		/* Disallows to set a breakpoint into the vsyscall */
+		case offsetof(struct user, u_debugreg[0]):
+			if (data >= TASK_SIZE-7) break;
+			child->thread.debugreg0 = data;
+			ret = 0;
+			break;
+		case offsetof(struct user, u_debugreg[1]):
+			if (data >= TASK_SIZE-7) break;
+			child->thread.debugreg1 = data;
+			ret = 0;
+			break;
+		case offsetof(struct user, u_debugreg[2]):
+			if (data >= TASK_SIZE-7) break;
+			child->thread.debugreg2 = data;
+			ret = 0;
+			break;
+		case offsetof(struct user, u_debugreg[3]):
+			if (data >= TASK_SIZE-7) break;
+			child->thread.debugreg3 = data;
+			ret = 0;
+			break;
+		case offsetof(struct user, u_debugreg[6]):
+				  if (data >> 32)
+				break; 
+			child->thread.debugreg6 = data;
+			ret = 0;
+			break;
+		case offsetof(struct user, u_debugreg[7]):
+			/* See arch/i386/kernel/ptrace.c for an explanation of
+			 * this awkward check.*/
+				  data &= ~DR_CONTROL_RESERVED;
+				  for(i=0; i<4; i++)
+					  if ((0x5454 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
+					break;
+			if (i == 4) {
+				child->thread.debugreg7 = data;
+			  ret = 0;
+		  }
+		  break;
+		}
+		break;
+	case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
+	case PTRACE_CONT: { /* restart after signal. */
+		long tmp;
+
+		ret = -EIO;
+		if ((unsigned long) data > _NSIG)
+			break;
+		if (request == PTRACE_SYSCALL)
+			set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
+		else
+			clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
+		clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+		child->exit_code = data;
+	/* make sure the single step bit is not set. */
+		tmp = get_stack_long(child, EFL_OFFSET);
+		tmp &= ~TRAP_FLAG;
+		put_stack_long(child, EFL_OFFSET,tmp);
+		wake_up_process(child);
+		ret = 0;
+		break;
+	}
+
+#ifdef CONFIG_IA32_EMULATION
+		/* This makes only sense with 32bit programs. Allow a
+		   64bit debugger to fully examine them too. Better
+		   don't use it against 64bit processes, use
+		   PTRACE_ARCH_PRCTL instead. */
+	case PTRACE_SET_THREAD_AREA: {
+		struct user_desc __user *p;
+		int old; 
+		p = (struct user_desc __user *)data;
+		get_user(old,  &p->entry_number); 
+		put_user(addr, &p->entry_number);
+		ret = do_set_thread_area(&child->thread, p);
+		put_user(old,  &p->entry_number); 
+		break;
+	case PTRACE_GET_THREAD_AREA:
+		p = (struct user_desc __user *)data;
+		get_user(old,  &p->entry_number); 
+		put_user(addr, &p->entry_number);
+		ret = do_get_thread_area(&child->thread, p);
+		put_user(old,  &p->entry_number); 
+		break;
+	} 
+#endif
+		/* normal 64bit interface to access TLS data. 
+		   Works just like arch_prctl, except that the arguments
+		   are reversed. */
+	case PTRACE_ARCH_PRCTL: 
+		ret = do_arch_prctl(child, data, addr);
+		break;
+
+/*
+ * make the child exit.  Best I can do is send it a sigkill. 
+ * perhaps it should be put in the status that it wants to 
+ * exit.
+ */
+	case PTRACE_KILL: {
+		long tmp;
+
+		ret = 0;
+		if (child->exit_state == EXIT_ZOMBIE)	/* already dead */
+			break;
+		clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+		child->exit_code = SIGKILL;
+		/* make sure the single step bit is not set. */
+		tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
+		put_stack_long(child, EFL_OFFSET, tmp);
+		wake_up_process(child);
+		break;
+	}
+
+	case PTRACE_SINGLESTEP: {  /* set the trap flag. */
+		long tmp;
+
+		ret = -EIO;
+		if ((unsigned long) data > _NSIG)
+			break;
+		clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
+		if ((child->ptrace & PT_DTRACE) == 0) {
+			/* Spurious delayed TF traps may occur */
+			child->ptrace |= PT_DTRACE;
+		}
+		tmp = get_stack_long(child, EFL_OFFSET) | TRAP_FLAG;
+		put_stack_long(child, EFL_OFFSET, tmp);
+		set_tsk_thread_flag(child, TIF_SINGLESTEP);
+		child->exit_code = data;
+		/* give it a chance to run. */
+		wake_up_process(child);
+		ret = 0;
+		break;
+	}
+
+	case PTRACE_DETACH:
+		/* detach a process that was attached. */
+		ret = ptrace_detach(child, data);
+		break;
+
+	case PTRACE_GETREGS: { /* Get all gp regs from the child. */
+	  	if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
+			       sizeof(struct user_regs_struct))) {
+			ret = -EIO;
+			break;
+		}
+		ret = 0;
+		for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
+			ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
+			data += sizeof(long);
+		}
+		break;
+	}
+
+	case PTRACE_SETREGS: { /* Set all gp regs in the child. */
+		unsigned long tmp;
+	  	if (!access_ok(VERIFY_READ, (unsigned __user *)data,
+			       sizeof(struct user_regs_struct))) {
+			ret = -EIO;
+			break;
+		}
+		ret = 0;
+		for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
+			ret |= __get_user(tmp, (unsigned long __user *) data);
+			putreg(child, ui, tmp);
+			data += sizeof(long);
+		}
+		break;
+	}
+
+	case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
+		if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
+			       sizeof(struct user_i387_struct))) {
+			ret = -EIO;
+			break;
+		}
+		ret = get_fpregs((struct user_i387_struct __user *)data, child);
+		break;
+	}
+
+	case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
+		if (!access_ok(VERIFY_READ, (unsigned __user *)data,
+			       sizeof(struct user_i387_struct))) {
+			ret = -EIO;
+			break;
+		}
+		set_stopped_child_used_math(child);
+		ret = set_fpregs(child, (struct user_i387_struct __user *)data);
+		break;
+	}
+
+	default:
+		ret = ptrace_request(child, request, addr, data);
+		break;
+	}
+out_tsk:
+	put_task_struct(child);
+out:
+	unlock_kernel();
+	return ret;
+}
+
+static void syscall_trace(struct pt_regs *regs)
+{
+
+#if 0
+	printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
+	       current->comm,
+	       regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
+	       current_thread_info()->flags, current->ptrace); 
+#endif
+
+	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
+				? 0x80 : 0));
+	/*
+	 * this isn't the same as continuing with a signal, but it will do
+	 * for normal use.  strace only continues with a signal if the
+	 * stopping signal is not SIGTRAP.  -brl
+	 */
+	if (current->exit_code) {
+		send_sig(current->exit_code, current, 1);
+		current->exit_code = 0;
+	}
+}
+
+asmlinkage void syscall_trace_enter(struct pt_regs *regs)
+{
+	/* do the secure computing check first */
+	secure_computing(regs->orig_rax);
+
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(current, regs->orig_rax,
+				    regs->rdi, regs->rsi,
+				    regs->rdx, regs->r10);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace(regs);
+}
+
+asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, regs->rax);
+
+	if ((test_thread_flag(TIF_SYSCALL_TRACE)
+	     || test_thread_flag(TIF_SINGLESTEP))
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace(regs);
+}
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
new file mode 100644
index 000000000000..be4b36f762cf
--- /dev/null
+++ b/arch/x86_64/kernel/reboot.c
@@ -0,0 +1,163 @@
+/* Various gunk just to reboot the machine. */ 
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <asm/io.h>
+#include <asm/kdebug.h>
+#include <asm/delay.h>
+#include <asm/hw_irq.h>
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/apic.h>
+
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+
+static long no_idt[3];
+static enum { 
+	BOOT_TRIPLE = 't',
+	BOOT_KBD = 'k'
+} reboot_type = BOOT_KBD;
+static int reboot_mode = 0;
+int reboot_force;
+
+/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
+   warm   Don't set the cold reboot flag
+   cold   Set the cold reboot flag
+   triple Force a triple fault (init)
+   kbd    Use the keyboard controller. cold reset (default)
+   force  Avoid anything that could hang.
+ */ 
+static int __init reboot_setup(char *str)
+{
+	for (;;) {
+		switch (*str) {
+		case 'w': 
+			reboot_mode = 0x1234;
+			break;
+
+		case 'c':
+			reboot_mode = 0;
+			break;
+
+		case 't':
+		case 'b':
+		case 'k':
+			reboot_type = *str;
+			break;
+		case 'f':
+			reboot_force = 1;
+			break;
+		}
+		if((str = strchr(str,',')) != NULL)
+			str++;
+		else
+			break;
+	}
+	return 1;
+}
+
+__setup("reboot=", reboot_setup);
+
+#ifdef CONFIG_SMP
+static void smp_halt(void)
+{
+	int cpuid = safe_smp_processor_id(); 
+	static int first_entry = 1;
+
+	if (reboot_force)
+		return;
+
+	if (first_entry) {
+		first_entry = 0;
+		smp_call_function((void *)machine_restart, NULL, 1, 0);
+	}
+			
+	smp_stop_cpu(); 
+
+	/* AP calling this. Just halt */
+	if (cpuid != boot_cpu_id) { 
+		for (;;) 
+			asm("hlt");
+	}
+
+	/* Wait for all other CPUs to have run smp_stop_cpu */
+	while (!cpus_empty(cpu_online_map))
+		rep_nop(); 
+}
+#endif
+
+static inline void kb_wait(void)
+{
+	int i;
+
+	for (i=0; i<0x10000; i++)
+		if ((inb_p(0x64) & 0x02) == 0)
+			break;
+}
+
+void machine_restart(char * __unused)
+{
+	int i;
+
+	printk("machine restart\n");
+
+#ifdef CONFIG_SMP
+	smp_halt(); 
+#endif
+
+	if (!reboot_force) {
+		local_irq_disable();
+#ifndef CONFIG_SMP
+		disable_local_APIC();
+#endif
+		disable_IO_APIC();
+		local_irq_enable();
+	}
+	
+	/* Tell the BIOS if we want cold or warm reboot */
+	*((unsigned short *)__va(0x472)) = reboot_mode;
+       
+	for (;;) {
+		/* Could also try the reset bit in the Hammer NB */
+		switch (reboot_type) { 
+		case BOOT_KBD:
+		for (i=0; i<100; i++) {
+			kb_wait();
+			udelay(50);
+			outb(0xfe,0x64);         /* pulse reset low */
+			udelay(50);
+		}
+
+		case BOOT_TRIPLE: 
+			__asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
+			__asm__ __volatile__("int3");
+
+			reboot_type = BOOT_KBD;
+			break;
+		}      
+	}      
+}
+
+EXPORT_SYMBOL(machine_restart);
+
+void machine_halt(void)
+{
+}
+
+EXPORT_SYMBOL(machine_halt);
+
+void machine_power_off(void)
+{
+	if (pm_power_off)
+		pm_power_off();
+}
+
+EXPORT_SYMBOL(machine_power_off);
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c
new file mode 100644
index 000000000000..48f7c18172b9
--- /dev/null
+++ b/arch/x86_64/kernel/semaphore.c
@@ -0,0 +1,180 @@
+/*
+ * x86_64 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <asm/errno.h>
+
+#include <asm/semaphore.h>
+
+/*
+ * Semaphores are implemented using a two-way counter:
+ * The "count" variable is decremented for each process
+ * that tries to acquire the semaphore, while the "sleeping"
+ * variable is a count of such acquires.
+ *
+ * Notably, the inline "up()" and "down()" functions can
+ * efficiently test if they need to do any extra work (up
+ * needs to do something only if count was negative before
+ * the increment operation.
+ *
+ * "sleeping" and the contention routine ordering is protected
+ * by the spinlock in the semaphore's waitqueue head.
+ *
+ * Note that these functions are only called when there is
+ * contention on the lock, and as such all this is the
+ * "non-critical" part of the whole semaphore business. The
+ * critical part is the inline stuff in <asm/semaphore.h>
+ * where we want to avoid any extra jumps and calls.
+ */
+
+/*
+ * Logic:
+ *  - only on a boundary condition do we need to care. When we go
+ *    from a negative count to a non-negative, we wake people up.
+ *  - when we go from a non-negative count to a negative do we
+ *    (a) synchronize with the "sleeper" count and (b) make sure
+ *    that we're on the wakeup list before we synchronize so that
+ *    we cannot lose wakeup events.
+ */
+
+void __up(struct semaphore *sem)
+{
+	wake_up(&sem->wait);
+}
+
+void __sched __down(struct semaphore * sem)
+{
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
+	tsk->state = TASK_UNINTERRUPTIBLE;
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+	sem->sleepers++;
+	for (;;) {
+		int sleepers = sem->sleepers;
+
+		/*
+		 * Add "everybody else" into it. They aren't
+		 * playing, because we own the spinlock in
+		 * the wait_queue_head.
+		 */
+		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
+			sem->sleepers = 0;
+			break;
+		}
+		sem->sleepers = 1;	/* us - see -1 above */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		schedule();
+
+		spin_lock_irqsave(&sem->wait.lock, flags);
+		tsk->state = TASK_UNINTERRUPTIBLE;
+	}
+	remove_wait_queue_locked(&sem->wait, &wait);
+	wake_up_locked(&sem->wait);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+	tsk->state = TASK_RUNNING;
+}
+
+int __sched __down_interruptible(struct semaphore * sem)
+{
+	int retval = 0;
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
+	tsk->state = TASK_INTERRUPTIBLE;
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+	sem->sleepers++;
+	for (;;) {
+		int sleepers = sem->sleepers;
+
+		/*
+		 * With signals pending, this turns into
+		 * the trylock failure case - we won't be
+		 * sleeping, and we* can't get the lock as
+		 * it has contention. Just correct the count
+		 * and exit.
+		 */
+		if (signal_pending(current)) {
+			retval = -EINTR;
+			sem->sleepers = 0;
+			atomic_add(sleepers, &sem->count);
+			break;
+		}
+
+		/*
+		 * Add "everybody else" into it. They aren't
+		 * playing, because we own the spinlock in
+		 * wait_queue_head. The "-1" is because we're
+		 * still hoping to get the semaphore.
+		 */
+		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
+			sem->sleepers = 0;
+			break;
+		}
+		sem->sleepers = 1;	/* us - see -1 above */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		schedule();
+
+		spin_lock_irqsave(&sem->wait.lock, flags);
+		tsk->state = TASK_INTERRUPTIBLE;
+	}
+	remove_wait_queue_locked(&sem->wait, &wait);
+	wake_up_locked(&sem->wait);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	tsk->state = TASK_RUNNING;
+	return retval;
+}
+
+/*
+ * Trylock failed - make sure we correct for
+ * having decremented the count.
+ *
+ * We could have done the trylock with a
+ * single "cmpxchg" without failure cases,
+ * but then it wouldn't work on a 386.
+ */
+int __down_trylock(struct semaphore * sem)
+{
+	int sleepers;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	sleepers = sem->sleepers + 1;
+	sem->sleepers = 0;
+
+	/*
+	 * Add "everybody else" and us into it. They aren't
+	 * playing, because we own the spinlock in the
+	 * wait_queue_head.
+	 */
+	if (!atomic_add_negative(sleepers, &sem->count)) {
+		wake_up_locked(&sem->wait);
+	}
+
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+	return 1;
+}
+
+
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
new file mode 100644
index 000000000000..e50fc41de6b9
--- /dev/null
+++ b/arch/x86_64/kernel/setup.c
@@ -0,0 +1,1189 @@
+/*
+ *  linux/arch/x86-64/kernel/setup.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Nov 2001 Dave Jones <davej@suse.de>
+ *  Forked from i386 setup code.
+ *
+ *  $Id$
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/tty.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <linux/console.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/kallsyms.h>
+#include <linux/edd.h>
+#include <asm/mtrr.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/msr.h>
+#include <asm/desc.h>
+#include <video/edid.h>
+#include <asm/e820.h>
+#include <asm/dma.h>
+#include <asm/mpspec.h>
+#include <asm/mmu_context.h>
+#include <asm/bootsetup.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/mach_apic.h>
+#include <asm/numa.h>
+
+/*
+ * Machine setup..
+ */
+
+struct cpuinfo_x86 boot_cpu_data;
+
+unsigned long mmu_cr4_features;
+
+int acpi_disabled;
+EXPORT_SYMBOL(acpi_disabled);
+#ifdef	CONFIG_ACPI_BOOT
+extern int __initdata acpi_ht;
+extern acpi_interrupt_flags	acpi_sci_flags;
+int __initdata acpi_force = 0;
+#endif
+
+int acpi_numa __initdata;
+
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0x10000000;
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
+unsigned long saved_video_mode;
+
+#ifdef CONFIG_SWIOTLB
+int swiotlb;
+EXPORT_SYMBOL(swiotlb);
+#endif
+
+/*
+ * Setup options
+ */
+struct drive_info_struct { char dummy[32]; } drive_info;
+struct screen_info screen_info;
+struct sys_desc_table_struct {
+	unsigned short length;
+	unsigned char table[0];
+};
+
+struct edid_info edid_info;
+struct e820map e820;
+
+extern int root_mountflags;
+extern char _text, _etext, _edata, _end;
+
+char command_line[COMMAND_LINE_SIZE];
+
+struct resource standard_io_resources[] = {
+	{ .name = "dma1", .start = 0x00, .end = 0x1f,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "pic1", .start = 0x20, .end = 0x21,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "timer0", .start = 0x40, .end = 0x43,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "timer1", .start = 0x50, .end = 0x53,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "keyboard", .start = 0x60, .end = 0x6f,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "fpu", .start = 0xf0, .end = 0xff,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+#define STANDARD_IO_RESOURCES \
+	(sizeof standard_io_resources / sizeof standard_io_resources[0])
+
+#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
+
+struct resource data_resource = {
+	.name = "Kernel data",
+	.start = 0,
+	.end = 0,
+	.flags = IORESOURCE_RAM,
+};
+struct resource code_resource = {
+	.name = "Kernel code",
+	.start = 0,
+	.end = 0,
+	.flags = IORESOURCE_RAM,
+};
+
+#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
+
+static struct resource system_rom_resource = {
+	.name = "System ROM",
+	.start = 0xf0000,
+	.end = 0xfffff,
+	.flags = IORESOURCE_ROM,
+};
+
+static struct resource extension_rom_resource = {
+	.name = "Extension ROM",
+	.start = 0xe0000,
+	.end = 0xeffff,
+	.flags = IORESOURCE_ROM,
+};
+
+static struct resource adapter_rom_resources[] = {
+	{ .name = "Adapter ROM", .start = 0xc8000, .end = 0,
+		.flags = IORESOURCE_ROM },
+	{ .name = "Adapter ROM", .start = 0, .end = 0,
+		.flags = IORESOURCE_ROM },
+	{ .name = "Adapter ROM", .start = 0, .end = 0,
+		.flags = IORESOURCE_ROM },
+	{ .name = "Adapter ROM", .start = 0, .end = 0,
+		.flags = IORESOURCE_ROM },
+	{ .name = "Adapter ROM", .start = 0, .end = 0,
+		.flags = IORESOURCE_ROM },
+	{ .name = "Adapter ROM", .start = 0, .end = 0,
+		.flags = IORESOURCE_ROM }
+};
+
+#define ADAPTER_ROM_RESOURCES \
+	(sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
+
+static struct resource video_rom_resource = {
+	.name = "Video ROM",
+	.start = 0xc0000,
+	.end = 0xc7fff,
+	.flags = IORESOURCE_ROM,
+};
+
+static struct resource video_ram_resource = {
+	.name = "Video RAM area",
+	.start = 0xa0000,
+	.end = 0xbffff,
+	.flags = IORESOURCE_RAM,
+};
+
+#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
+
+static int __init romchecksum(unsigned char *rom, unsigned long length)
+{
+	unsigned char *p, sum = 0;
+
+	for (p = rom; p < rom + length; p++)
+		sum += *p;
+	return sum == 0;
+}
+
+static void __init probe_roms(void)
+{
+	unsigned long start, length, upper;
+	unsigned char *rom;
+	int	      i;
+
+	/* video rom */
+	upper = adapter_rom_resources[0].start;
+	for (start = video_rom_resource.start; start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
+
+		video_rom_resource.start = start;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = rom[2] * 512;
+
+		/* if checksum okay, trust length byte */
+		if (length && romchecksum(rom, length))
+			video_rom_resource.end = start + length - 1;
+
+		request_resource(&iomem_resource, &video_rom_resource);
+		break;
+			}
+
+	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+	if (start < upper)
+		start = upper;
+
+	/* system rom */
+	request_resource(&iomem_resource, &system_rom_resource);
+	upper = system_rom_resource.start;
+
+	/* check for extension rom (ignore length byte!) */
+	rom = isa_bus_to_virt(extension_rom_resource.start);
+	if (romsignature(rom)) {
+		length = extension_rom_resource.end - extension_rom_resource.start + 1;
+		if (romchecksum(rom, length)) {
+			request_resource(&iomem_resource, &extension_rom_resource);
+			upper = extension_rom_resource.start;
+		}
+	}
+
+	/* check for adapter roms on 2k boundaries */
+	for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = rom[2] * 512;
+
+		/* but accept any length that fits if checksum okay */
+		if (!length || start + length > upper || !romchecksum(rom, length))
+			continue;
+
+		adapter_rom_resources[i].start = start;
+		adapter_rom_resources[i].end = start + length - 1;
+		request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+		start = adapter_rom_resources[i++].end & ~2047UL;
+	}
+}
+
+static __init void parse_cmdline_early (char ** cmdline_p)
+{
+	char c = ' ', *to = command_line, *from = COMMAND_LINE;
+	int len = 0;
+
+	/* Save unparsed command line copy for /proc/cmdline */
+	memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE);
+	saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
+
+	for (;;) {
+		if (c != ' ') 
+			goto next_char; 
+
+#ifdef  CONFIG_SMP
+		/*
+		 * If the BIOS enumerates physical processors before logical,
+		 * maxcpus=N at enumeration-time can be used to disable HT.
+		 */
+		else if (!memcmp(from, "maxcpus=", 8)) {
+			extern unsigned int maxcpus;
+
+			maxcpus = simple_strtoul(from + 8, NULL, 0);
+		}
+#endif
+#ifdef CONFIG_ACPI_BOOT
+		/* "acpi=off" disables both ACPI table parsing and interpreter init */
+		if (!memcmp(from, "acpi=off", 8))
+			disable_acpi();
+
+		if (!memcmp(from, "acpi=force", 10)) { 
+			/* add later when we do DMI horrors: */
+			acpi_force = 1;
+			acpi_disabled = 0;
+		}
+
+		/* acpi=ht just means: do ACPI MADT parsing 
+		   at bootup, but don't enable the full ACPI interpreter */
+		if (!memcmp(from, "acpi=ht", 7)) { 
+			if (!acpi_force)
+				disable_acpi();
+			acpi_ht = 1; 
+		}
+                else if (!memcmp(from, "pci=noacpi", 10)) 
+			acpi_disable_pci();
+		else if (!memcmp(from, "acpi=noirq", 10))
+			acpi_noirq_set();
+
+		else if (!memcmp(from, "acpi_sci=edge", 13))
+			acpi_sci_flags.trigger =  1;
+		else if (!memcmp(from, "acpi_sci=level", 14))
+			acpi_sci_flags.trigger = 3;
+		else if (!memcmp(from, "acpi_sci=high", 13))
+			acpi_sci_flags.polarity = 1;
+		else if (!memcmp(from, "acpi_sci=low", 12))
+			acpi_sci_flags.polarity = 3;
+
+		/* acpi=strict disables out-of-spec workarounds */
+		else if (!memcmp(from, "acpi=strict", 11)) {
+			acpi_strict = 1;
+		}
+#endif
+
+		if (!memcmp(from, "nolapic", 7) ||
+		    !memcmp(from, "disableapic", 11))
+			disable_apic = 1;
+
+		if (!memcmp(from, "noapic", 6)) 
+			skip_ioapic_setup = 1;
+
+		if (!memcmp(from, "apic", 4)) { 
+			skip_ioapic_setup = 0;
+			ioapic_force = 1;
+		}
+			
+		if (!memcmp(from, "mem=", 4))
+			parse_memopt(from+4, &from); 
+
+#ifdef CONFIG_DISCONTIGMEM
+		if (!memcmp(from, "numa=", 5))
+			numa_setup(from+5); 
+#endif
+
+#ifdef CONFIG_GART_IOMMU 
+		if (!memcmp(from,"iommu=",6)) { 
+			iommu_setup(from+6); 
+		}
+#endif
+
+		if (!memcmp(from,"oops=panic", 10))
+			panic_on_oops = 1;
+
+		if (!memcmp(from, "noexec=", 7))
+			nonx_setup(from + 7);
+
+	next_char:
+		c = *(from++);
+		if (!c)
+			break;
+		if (COMMAND_LINE_SIZE <= ++len)
+			break;
+		*(to++) = c;
+	}
+	*to = '\0';
+	*cmdline_p = command_line;
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+static void __init contig_initmem_init(void)
+{
+        unsigned long bootmap_size, bootmap; 
+        bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+        bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
+        if (bootmap == -1L) 
+                panic("Cannot find bootmem map of size %ld\n",bootmap_size);
+        bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
+        e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT); 
+        reserve_bootmem(bootmap, bootmap_size);
+} 
+#endif
+
+/* Use inline assembly to define this because the nops are defined 
+   as inline assembly strings in the include files and we cannot 
+   get them easily into strings. */
+asm("\t.data\nk8nops: " 
+    K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
+    K8_NOP7 K8_NOP8); 
+    
+extern unsigned char k8nops[];
+static unsigned char *k8_nops[ASM_NOP_MAX+1] = { 
+     NULL,
+     k8nops,
+     k8nops + 1,
+     k8nops + 1 + 2,
+     k8nops + 1 + 2 + 3,
+     k8nops + 1 + 2 + 3 + 4,
+     k8nops + 1 + 2 + 3 + 4 + 5,
+     k8nops + 1 + 2 + 3 + 4 + 5 + 6,
+     k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+}; 
+
+/* Replace instructions with better alternatives for this CPU type.
+
+   This runs before SMP is initialized to avoid SMP problems with
+   self modifying code. This implies that assymetric systems where
+   APs have less capabilities than the boot processor are not handled. 
+   In this case boot with "noreplacement". */ 
+void apply_alternatives(void *start, void *end) 
+{ 
+	struct alt_instr *a; 
+	int diff, i, k;
+	for (a = start; (void *)a < end; a++) { 
+		if (!boot_cpu_has(a->cpuid))
+			continue;
+
+		BUG_ON(a->replacementlen > a->instrlen); 
+		__inline_memcpy(a->instr, a->replacement, a->replacementlen); 
+		diff = a->instrlen - a->replacementlen; 
+
+		/* Pad the rest with nops */
+		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
+			k = diff;
+			if (k > ASM_NOP_MAX)
+				k = ASM_NOP_MAX;
+			__inline_memcpy(a->instr + i, k8_nops[k], k); 
+		} 
+	}
+} 
+
+static int no_replacement __initdata = 0; 
+ 
+void __init alternative_instructions(void)
+{
+	extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+	if (no_replacement) 
+		return;
+	apply_alternatives(__alt_instructions, __alt_instructions_end);
+}
+
+static int __init noreplacement_setup(char *s)
+{ 
+     no_replacement = 1; 
+     return 0; 
+} 
+
+__setup("noreplacement", noreplacement_setup); 
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ *              from boot_params into a safe place.
+ *
+ */
+static inline void copy_edd(void)
+{
+     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
+     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
+     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
+     edd.edd_info_nr = EDD_NR;
+}
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+#define EBDA_ADDR_POINTER 0x40E
+static void __init reserve_ebda_region(void)
+{
+	unsigned int addr;
+	/** 
+	 * there is a real-mode segmented pointer pointing to the 
+	 * 4K EBDA area at 0x40E
+	 */
+	addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
+	addr <<= 4;
+	if (addr)
+		reserve_bootmem_generic(addr, PAGE_SIZE);
+}
+
+void __init setup_arch(char **cmdline_p)
+{
+	unsigned long low_mem_size;
+	unsigned long kernel_end;
+
+ 	ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
+ 	drive_info = DRIVE_INFO;
+ 	screen_info = SCREEN_INFO;
+	edid_info = EDID_INFO;
+	saved_video_mode = SAVED_VIDEO_MODE;
+	bootloader_type = LOADER_TYPE;
+
+#ifdef CONFIG_BLK_DEV_RAM
+	rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
+	rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
+	rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
+#endif
+	setup_memory_region();
+	copy_edd();
+
+	if (!MOUNT_ROOT_RDONLY)
+		root_mountflags &= ~MS_RDONLY;
+	init_mm.start_code = (unsigned long) &_text;
+	init_mm.end_code = (unsigned long) &_etext;
+	init_mm.end_data = (unsigned long) &_edata;
+	init_mm.brk = (unsigned long) &_end;
+
+	code_resource.start = virt_to_phys(&_text);
+	code_resource.end = virt_to_phys(&_etext)-1;
+	data_resource.start = virt_to_phys(&_etext);
+	data_resource.end = virt_to_phys(&_edata)-1;
+
+	parse_cmdline_early(cmdline_p);
+
+	early_identify_cpu(&boot_cpu_data);
+
+	/*
+	 * partially used pages are not usable - thus
+	 * we are rounding upwards:
+	 */
+	end_pfn = e820_end_of_ram();
+
+	check_efer();
+
+	init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
+
+#ifdef CONFIG_ACPI_BOOT
+	/*
+	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
+	 * Call this early for SRAT node setup.
+	 */
+	acpi_boot_table_init();
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+	/*
+	 * Parse SRAT to discover nodes.
+	 */
+	acpi_numa_init();
+#endif
+
+#ifdef CONFIG_DISCONTIGMEM
+	numa_initmem_init(0, end_pfn); 
+#else
+	contig_initmem_init(); 
+#endif
+
+	/* Reserve direct mapping */
+	reserve_bootmem_generic(table_start << PAGE_SHIFT, 
+				(table_end - table_start) << PAGE_SHIFT);
+
+	/* reserve kernel */
+	kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
+	reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
+
+	/*
+	 * reserve physical page 0 - it's a special BIOS page on many boxes,
+	 * enabling clean reboots, SMP operation, laptop functions.
+	 */
+	reserve_bootmem_generic(0, PAGE_SIZE);
+
+	/* reserve ebda region */
+	reserve_ebda_region();
+
+#ifdef CONFIG_SMP
+	/*
+	 * But first pinch a few for the stack/trampoline stuff
+	 * FIXME: Don't need the extra page at 4K, but need to fix
+	 * trampoline before removing it. (see the GDT stuff)
+	 */
+	reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
+
+	/* Reserve SMP trampoline */
+	reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
+#endif
+
+#ifdef CONFIG_ACPI_SLEEP
+       /*
+        * Reserve low memory region for sleep support.
+        */
+       acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+	/*
+	 * Find and reserve possible boot-time SMP configuration:
+	 */
+	find_smp_config();
+#endif
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (LOADER_TYPE && INITRD_START) {
+		if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
+			reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
+			initrd_start =
+				INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+			initrd_end = initrd_start+INITRD_SIZE;
+		}
+		else {
+			printk(KERN_ERR "initrd extends beyond end of memory "
+			    "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+			    (unsigned long)(INITRD_START + INITRD_SIZE),
+			    (unsigned long)(end_pfn << PAGE_SHIFT));
+			initrd_start = 0;
+		}
+	}
+#endif
+	paging_init();
+
+	check_ioapic();
+
+#ifdef CONFIG_ACPI_BOOT
+	/*
+	 * Read APIC and some other early information from ACPI tables.
+	 */
+	acpi_boot_init();
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/*
+	 * get boot-time SMP configuration:
+	 */
+	if (smp_found_config)
+		get_smp_config();
+	init_apic_mappings();
+#endif
+
+	/*
+	 * Request address space for all standard RAM and ROM resources
+	 * and also for regions reported as reserved by the e820.
+	 */
+	probe_roms();
+	e820_reserve_resources(); 
+
+	request_resource(&iomem_resource, &video_ram_resource);
+
+	{
+	unsigned i;
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+	}
+
+	/* Will likely break when you have unassigned resources with more
+	   than 4GB memory and bridges that don't support more than 4GB. 
+	   Doing it properly would require to use pci_alloc_consistent
+	   in this case. */
+	low_mem_size = ((end_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff;
+	if (low_mem_size > pci_mem_start)
+		pci_mem_start = low_mem_size;
+
+#ifdef CONFIG_GART_IOMMU
+       iommu_hole_init();
+#endif
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+	conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+	conswitchp = &dummy_con;
+#endif
+#endif
+}
+
+static int __init get_model_name(struct cpuinfo_x86 *c)
+{
+	unsigned int *v;
+
+	if (c->x86_cpuid_level < 0x80000004)
+		return 0;
+
+	v = (unsigned int *) c->x86_model_id;
+	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+	c->x86_model_id[48] = 0;
+	return 1;
+}
+
+
+static void __init display_cacheinfo(struct cpuinfo_x86 *c)
+{
+	unsigned int n, dummy, eax, ebx, ecx, edx;
+
+	n = c->x86_cpuid_level;
+
+	if (n >= 0x80000005) {
+		cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
+			edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+		c->x86_cache_size=(ecx>>24)+(edx>>24);
+		/* On K8 L1 TLB is inclusive, so don't count it */
+		c->x86_tlbsize = 0;
+	}
+
+	if (n >= 0x80000006) {
+		cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+		ecx = cpuid_ecx(0x80000006);
+		c->x86_cache_size = ecx >> 16;
+		c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+
+		printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+		c->x86_cache_size, ecx & 0xFF);
+	}
+
+	if (n >= 0x80000007)
+		cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); 
+	if (n >= 0x80000008) {
+		cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+	}
+}
+
+
+static int __init init_amd(struct cpuinfo_x86 *c)
+{
+	int r;
+	int level;
+#ifdef CONFIG_NUMA
+	int cpu;
+#endif
+
+	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
+	clear_bit(0*32+31, &c->x86_capability);
+	
+	/* C-stepping K8? */
+	level = cpuid_eax(1);
+	if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+		set_bit(X86_FEATURE_K8_C, &c->x86_capability);
+
+	r = get_model_name(c);
+	if (!r) { 
+		switch (c->x86) { 
+		case 15:
+			/* Should distinguish Models here, but this is only
+			   a fallback anyways. */
+			strcpy(c->x86_model_id, "Hammer");
+			break; 
+		} 
+	} 
+	display_cacheinfo(c);
+
+	if (c->x86_cpuid_level >= 0x80000008) {
+		c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+		if (c->x86_num_cores & (c->x86_num_cores - 1))
+			c->x86_num_cores = 1;
+
+#ifdef CONFIG_NUMA
+		/* On a dual core setup the lower bits of apic id
+		   distingush the cores. Fix up the CPU<->node mappings
+		   here based on that.
+		   Assumes number of cores is a power of two.
+		   When using SRAT use mapping from SRAT. */
+		cpu = c->x86_apicid;
+		if (acpi_numa <= 0 && c->x86_num_cores > 1) {
+			cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1);
+			if (!node_online(cpu_to_node[cpu]))
+				cpu_to_node[cpu] = first_node(node_online_map);
+		}
+		printk(KERN_INFO "CPU %d(%d) -> Node %d\n",
+				cpu, c->x86_num_cores, cpu_to_node[cpu]);
+#endif
+	}
+
+	return r;
+}
+
+static void __init detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	u32 	eax, ebx, ecx, edx;
+	int 	index_lsb, index_msb, tmp;
+	int 	cpu = smp_processor_id();
+	
+	if (!cpu_has(c, X86_FEATURE_HT))
+		return;
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+	smp_num_siblings = (ebx & 0xff0000) >> 16;
+	
+	if (smp_num_siblings == 1) {
+		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
+	} else if (smp_num_siblings > 1) {
+		index_lsb = 0;
+		index_msb = 31;
+		/*
+		 * At this point we only support two siblings per
+		 * processor package.
+		 */
+		if (smp_num_siblings > NR_CPUS) {
+			printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
+			smp_num_siblings = 1;
+			return;
+		}
+		tmp = smp_num_siblings;
+		while ((tmp & 1) == 0) {
+			tmp >>=1 ;
+			index_lsb++;
+		}
+		tmp = smp_num_siblings;
+		while ((tmp & 0x80000000 ) == 0) {
+			tmp <<=1 ;
+			index_msb--;
+		}
+		if (index_lsb != index_msb )
+			index_msb++;
+		phys_proc_id[cpu] = phys_pkg_id(index_msb);
+		
+		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+		       phys_proc_id[cpu]);
+	}
+#endif
+}
+
+static void __init sched_cmp_hack(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* AMD dual core looks like HT but isn't really. Hide it from the
+	   scheduler. This works around problems with the domain scheduler.
+	   Also probably gives slightly better scheduling and disables
+	   SMT nice which is harmful on dual core.
+	   TBD tune the domain scheduler for dual core. */
+	if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_CMP_LEGACY))
+		smp_num_siblings = 1;
+#endif
+}
+	
+static void __init init_intel(struct cpuinfo_x86 *c)
+{
+	/* Cache sizes */
+	unsigned n;
+
+	init_intel_cacheinfo(c);
+	n = c->x86_cpuid_level;
+	if (n >= 0x80000008) {
+		unsigned eax = cpuid_eax(0x80000008);
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+	}
+
+	if (c->x86 == 15)
+		c->x86_cache_alignment = c->x86_clflush_size * 2;
+}
+
+void __init get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+	char *v = c->x86_vendor_id;
+
+	if (!strcmp(v, "AuthenticAMD"))
+		c->x86_vendor = X86_VENDOR_AMD;
+	else if (!strcmp(v, "GenuineIntel"))
+		c->x86_vendor = X86_VENDOR_INTEL;
+	else
+		c->x86_vendor = X86_VENDOR_UNKNOWN;
+}
+
+struct cpu_model_info {
+	int vendor;
+	int family;
+	char *model_names[16];
+};
+
+/* Do some early cpuid on the boot CPU to get some parameter that are
+   needed before check_bugs. Everything advanced is in identify_cpu
+   below. */
+void __init early_identify_cpu(struct cpuinfo_x86 *c)
+{
+	u32 tfms;
+
+	c->loops_per_jiffy = loops_per_jiffy;
+	c->x86_cache_size = -1;
+	c->x86_vendor = X86_VENDOR_UNKNOWN;
+	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
+	c->x86_vendor_id[0] = '\0'; /* Unset */
+	c->x86_model_id[0] = '\0';  /* Unset */
+	c->x86_clflush_size = 64;
+	c->x86_cache_alignment = c->x86_clflush_size;
+	c->x86_num_cores = 1;
+	c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data;
+	c->x86_cpuid_level = 0;
+	memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+	/* Get vendor name */
+	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+	      (unsigned int *)&c->x86_vendor_id[0],
+	      (unsigned int *)&c->x86_vendor_id[8],
+	      (unsigned int *)&c->x86_vendor_id[4]);
+		
+	get_cpu_vendor(c);
+
+	/* Initialize the standard set of capabilities */
+	/* Note that the vendor-specific code below might override */
+
+	/* Intel-defined flags: level 0x00000001 */
+	if (c->cpuid_level >= 0x00000001) {
+		__u32 misc;
+		cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
+		      &c->x86_capability[0]);
+		c->x86 = (tfms >> 8) & 0xf;
+		c->x86_model = (tfms >> 4) & 0xf;
+		c->x86_mask = tfms & 0xf;
+		if (c->x86 == 0xf) {
+			c->x86 += (tfms >> 20) & 0xff;
+			c->x86_model += ((tfms >> 16) & 0xF) << 4;
+		} 
+		if (c->x86_capability[0] & (1<<19)) 
+			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+		c->x86_apicid = misc >> 24;
+	} else {
+		/* Have CPUID level 0 only - unheard of */
+		c->x86 = 4;
+	}
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+void __init identify_cpu(struct cpuinfo_x86 *c)
+{
+	int i;
+	u32 xlvl;
+
+	early_identify_cpu(c);
+
+	/* AMD-defined flags: level 0x80000001 */
+	xlvl = cpuid_eax(0x80000000);
+	c->x86_cpuid_level = xlvl;
+	if ((xlvl & 0xffff0000) == 0x80000000) {
+		if (xlvl >= 0x80000001) {
+			c->x86_capability[1] = cpuid_edx(0x80000001);
+			c->x86_capability[5] = cpuid_ecx(0x80000001);
+		}
+		if (xlvl >= 0x80000004)
+			get_model_name(c); /* Default name */
+	}
+
+	/* Transmeta-defined flags: level 0x80860001 */
+	xlvl = cpuid_eax(0x80860000);
+	if ((xlvl & 0xffff0000) == 0x80860000) {
+		/* Don't set x86_cpuid_level here for now to not confuse. */
+		if (xlvl >= 0x80860001)
+			c->x86_capability[2] = cpuid_edx(0x80860001);
+	}
+
+	/*
+	 * Vendor-specific initialization.  In this section we
+	 * canonicalize the feature flags, meaning if there are
+	 * features a certain CPU supports which CPUID doesn't
+	 * tell us, CPUID claiming incorrect flags, or other bugs,
+	 * we handle them here.
+	 *
+	 * At the end of this section, c->x86_capability better
+	 * indicate the features this CPU genuinely supports!
+	 */
+	switch (c->x86_vendor) {
+	case X86_VENDOR_AMD:
+		init_amd(c);
+		break;
+
+	case X86_VENDOR_INTEL:
+		init_intel(c);
+		break;
+
+	case X86_VENDOR_UNKNOWN:
+	default:
+		display_cacheinfo(c);
+		break;
+	}
+
+	select_idle_routine(c);
+	detect_ht(c); 
+	sched_cmp_hack(c);
+
+	/*
+	 * On SMP, boot_cpu_data holds the common feature set between
+	 * all CPUs; so make sure that we indicate which features are
+	 * common between the CPUs.  The first time this routine gets
+	 * executed, c == &boot_cpu_data.
+	 */
+	if (c != &boot_cpu_data) {
+		/* AND the already accumulated flags with these */
+		for (i = 0 ; i < NCAPINTS ; i++)
+			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+	}
+
+#ifdef CONFIG_X86_MCE
+	mcheck_init(c);
+#endif
+#ifdef CONFIG_NUMA
+	if (c != &boot_cpu_data)
+		numa_add_cpu(c - cpu_data);
+#endif
+}
+ 
+
+void __init print_cpu_info(struct cpuinfo_x86 *c)
+{
+	if (c->x86_model_id[0])
+		printk("%s", c->x86_model_id);
+
+	if (c->x86_mask || c->cpuid_level >= 0) 
+		printk(" stepping %02x\n", c->x86_mask);
+	else
+		printk("\n");
+}
+
+/*
+ *	Get CPU information for use by the procfs.
+ */
+
+static int show_cpuinfo(struct seq_file *m, void *v)
+{
+	struct cpuinfo_x86 *c = v;
+
+	/* 
+	 * These flag bits must match the definitions in <asm/cpufeature.h>.
+	 * NULL means this bit is undefined or reserved; either way it doesn't
+	 * have meaning as far as Linux is concerned.  Note that it's important
+	 * to realize there is a difference between this table and CPUID -- if
+	 * applications want to get the raw CPUID data, they should access
+	 * /dev/cpu/<cpu_nr>/cpuid instead.
+	 */
+	static char *x86_cap_flags[] = {
+		/* Intel-defined */
+	        "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
+	        "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
+	        "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
+	        "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
+
+		/* AMD-defined */
+		"pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
+		NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow",
+
+		/* Transmeta-defined */
+		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+		/* Other (Linux-defined) */
+		"cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+		/* Intel-defined (#2) */
+		"pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est",
+		"tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+		/* AMD-defined (#2) */
+		"lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
+	};
+	static char *x86_power_flags[] = { 
+		"ts",	/* temperature sensor */
+		"fid",  /* frequency id control */
+		"vid",  /* voltage id control */
+		"ttp",  /* thermal trip */
+		"tm",
+		"stc"
+	};
+
+
+#ifdef CONFIG_SMP
+	if (!cpu_online(c-cpu_data))
+		return 0;
+#endif
+
+	seq_printf(m,"processor\t: %u\n"
+		     "vendor_id\t: %s\n"
+		     "cpu family\t: %d\n"
+		     "model\t\t: %d\n"
+		     "model name\t: %s\n",
+		     (unsigned)(c-cpu_data),
+		     c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
+		     c->x86,
+		     (int)c->x86_model,
+		     c->x86_model_id[0] ? c->x86_model_id : "unknown");
+	
+	if (c->x86_mask || c->cpuid_level >= 0)
+		seq_printf(m, "stepping\t: %d\n", c->x86_mask);
+	else
+		seq_printf(m, "stepping\t: unknown\n");
+	
+	if (cpu_has(c,X86_FEATURE_TSC)) {
+		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
+			     cpu_khz / 1000, (cpu_khz % 1000));
+	}
+
+	/* Cache size */
+	if (c->x86_cache_size >= 0) 
+		seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
+	
+#ifdef CONFIG_SMP
+	seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]);
+	seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings);
+#endif	
+
+	seq_printf(m,
+	        "fpu\t\t: yes\n"
+	        "fpu_exception\t: yes\n"
+	        "cpuid level\t: %d\n"
+	        "wp\t\t: yes\n"
+	        "flags\t\t:",
+		   c->cpuid_level);
+
+	{ 
+		int i; 
+		for ( i = 0 ; i < 32*NCAPINTS ; i++ )
+			if ( test_bit(i, &c->x86_capability) &&
+			     x86_cap_flags[i] != NULL )
+				seq_printf(m, " %s", x86_cap_flags[i]);
+	}
+		
+	seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
+		   c->loops_per_jiffy/(500000/HZ),
+		   (c->loops_per_jiffy/(5000/HZ)) % 100);
+
+	if (c->x86_tlbsize > 0) 
+		seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
+	seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
+	seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
+
+	seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 
+		   c->x86_phys_bits, c->x86_virt_bits);
+
+	seq_printf(m, "power management:");
+	{
+		unsigned i;
+		for (i = 0; i < 32; i++) 
+			if (c->x86_power & (1 << i)) {
+				if (i < ARRAY_SIZE(x86_power_flags))
+					seq_printf(m, " %s", x86_power_flags[i]);
+				else
+					seq_printf(m, " [%d]", i);
+			}
+	}
+	seq_printf(m, "\n");
+
+	if (c->x86_num_cores > 1)
+		seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores);
+
+	seq_printf(m, "\n\n"); 
+
+	return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	return *pos < NR_CPUS ? cpu_data + *pos : NULL;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return c_start(m, pos);
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+}
+
+struct seq_operations cpuinfo_op = {
+	.start =c_start,
+	.next =	c_next,
+	.stop =	c_stop,
+	.show =	show_cpuinfo,
+};
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
new file mode 100644
index 000000000000..b5305b04bc40
--- /dev/null
+++ b/arch/x86_64/kernel/setup64.c
@@ -0,0 +1,292 @@
+/* 
+ * X86-64 specific CPU setup.
+ * Copyright (C) 1995  Linus Torvalds
+ * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
+ * See setup.c for older changelog.
+ * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
+ */ 
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/mmu_context.h>
+#include <asm/smp.h>
+#include <asm/i387.h>
+#include <asm/percpu.h>
+#include <asm/mtrr.h>
+#include <asm/proto.h>
+#include <asm/mman.h>
+#include <asm/numa.h>
+
+char x86_boot_params[2048] __initdata = {0,};
+
+cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
+
+struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; 
+
+extern struct task_struct init_task;
+
+extern unsigned char __per_cpu_start[], __per_cpu_end[]; 
+
+extern struct desc_ptr cpu_gdt_descr[];
+struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; 
+
+char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
+
+unsigned long __supported_pte_mask = ~0UL;
+static int do_not_nx __initdata = 0;
+
+/* noexec=on|off
+Control non executable mappings for 64bit processes.
+
+on	Enable(default)
+off	Disable
+*/ 
+int __init nonx_setup(char *str)
+{
+	if (!strncmp(str, "on", 2)) {
+                __supported_pte_mask |= _PAGE_NX; 
+ 		do_not_nx = 0; 
+	} else if (!strncmp(str, "off", 3)) {
+		do_not_nx = 1;
+		__supported_pte_mask &= ~_PAGE_NX;
+        }
+	return 0;
+} 
+__setup("noexec=", nonx_setup);	/* parsed early actually */
+
+int force_personality32 = READ_IMPLIES_EXEC;
+
+/* noexec32=on|off
+Control non executable heap for 32bit processes.
+To control the stack too use noexec=off
+
+on	PROT_READ does not imply PROT_EXEC for 32bit processes
+off	PROT_READ implies PROT_EXEC (default)
+*/
+static int __init nonx32_setup(char *str)
+{
+	if (!strcmp(str, "on"))
+		force_personality32 &= ~READ_IMPLIES_EXEC;
+	else if (!strcmp(str, "off"))
+		force_personality32 |= READ_IMPLIES_EXEC;
+	return 0;
+}
+__setup("noexec32=", nonx32_setup);
+
+/*
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
+ */
+void __init setup_per_cpu_areas(void)
+{ 
+	int i;
+	unsigned long size;
+
+	/* Copy section for each CPU (we discard the original) */
+	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
+#ifdef CONFIG_MODULES
+	if (size < PERCPU_ENOUGH_ROOM)
+		size = PERCPU_ENOUGH_ROOM;
+#endif
+
+	for (i = 0; i < NR_CPUS; i++) { 
+		unsigned char *ptr;
+
+		if (!NODE_DATA(cpu_to_node(i))) {
+			printk("cpu with no node %d, num_online_nodes %d\n",
+			       i, num_online_nodes());
+			ptr = alloc_bootmem(size);
+		} else { 
+			ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
+		}
+		if (!ptr)
+			panic("Cannot allocate cpu data for CPU %d\n", i);
+		cpu_pda[i].data_offset = ptr - __per_cpu_start;
+		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+	}
+} 
+
+void pda_init(int cpu)
+{ 
+	struct x8664_pda *pda = &cpu_pda[cpu];
+
+	/* Setup up data that may be needed in __get_free_pages early */
+	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
+	wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
+
+	pda->me = pda;
+	pda->cpunumber = cpu; 
+	pda->irqcount = -1;
+	pda->kernelstack = 
+		(unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
+	pda->active_mm = &init_mm;
+	pda->mmu_state = 0;
+
+	if (cpu == 0) {
+		/* others are initialized in smpboot.c */
+		pda->pcurrent = &init_task;
+		pda->irqstackptr = boot_cpu_stack; 
+	} else {
+		pda->irqstackptr = (char *)
+			__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+		if (!pda->irqstackptr)
+			panic("cannot allocate irqstack for cpu %d", cpu); 
+	}
+
+	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+
+	pda->irqstackptr += IRQSTACKSIZE-64;
+} 
+
+char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] 
+__attribute__((section(".bss.page_aligned")));
+
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+	/* 
+	 * LSTAR and STAR live in a bit strange symbiosis.
+	 * They both write to the same internal register. STAR allows to set CS/DS
+	 * but only a 32bit target. LSTAR sets the 64bit rip. 	 
+	 */ 
+	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
+	wrmsrl(MSR_LSTAR, system_call); 
+
+#ifdef CONFIG_IA32_EMULATION   		
+	syscall32_cpu_init ();
+#endif
+
+	/* Flags to clear on syscall */
+	wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
+}
+
+void __init check_efer(void)
+{
+	unsigned long efer;
+
+	rdmsrl(MSR_EFER, efer); 
+        if (!(efer & EFER_NX) || do_not_nx) { 
+                __supported_pte_mask &= ~_PAGE_NX; 
+        }       
+}
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init.
+ */
+void __init cpu_init (void)
+{
+#ifdef CONFIG_SMP
+	int cpu = stack_smp_processor_id();
+#else
+	int cpu = smp_processor_id();
+#endif
+	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	unsigned long v; 
+	char *estacks = NULL; 
+	struct task_struct *me;
+	int i;
+
+	/* CPU 0 is initialised in head64.c */
+	if (cpu != 0) {
+		pda_init(cpu);
+	} else 
+		estacks = boot_exception_stacks; 
+
+	me = current;
+
+	if (cpu_test_and_set(cpu, cpu_initialized))
+		panic("CPU#%d already initialized!\n", cpu);
+
+	printk("Initializing CPU#%d\n", cpu);
+
+		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+	/*
+	 * Initialize the per-CPU GDT with the boot GDT,
+	 * and set up the GDT descriptor:
+	 */
+	if (cpu) {
+		memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE);
+	}	
+
+	cpu_gdt_descr[cpu].size = GDT_SIZE;
+	cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu];
+	asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
+	asm volatile("lidt %0" :: "m" (idt_descr));
+
+	memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8);
+
+	/*
+	 * Delete NT
+	 */
+
+	asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax");
+
+	syscall_init();
+
+	wrmsrl(MSR_FS_BASE, 0);
+	wrmsrl(MSR_KERNEL_GS_BASE, 0);
+	barrier(); 
+
+	check_efer();
+
+	/*
+	 * set up and load the per-CPU TSS
+	 */
+	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+		if (cpu) {
+			estacks = (char *)__get_free_pages(GFP_ATOMIC, 
+						   EXCEPTION_STACK_ORDER);
+			if (!estacks)
+				panic("Cannot allocate exception stack %ld %d\n",
+				      v, cpu); 
+		}
+		estacks += EXCEPTION_STKSZ;
+		t->ist[v] = (unsigned long)estacks;
+	}
+
+	t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	/*
+	 * <= is required because the CPU will access up to
+	 * 8 bits beyond the end of the IO permission bitmap.
+	 */
+	for (i = 0; i <= IO_BITMAP_LONGS; i++)
+		t->io_bitmap[i] = ~0UL;
+
+	atomic_inc(&init_mm.mm_count);
+	me->active_mm = &init_mm;
+	if (me->mm)
+		BUG();
+	enter_lazy_tlb(&init_mm, me);
+
+	set_tss_desc(cpu, t);
+	load_TR_desc();
+	load_LDT(&init_mm.context);
+
+	/*
+	 * Clear all 6 debug registers:
+	 */
+
+	set_debug(0UL, 0);
+	set_debug(0UL, 1);
+	set_debug(0UL, 2);
+	set_debug(0UL, 3);
+	set_debug(0UL, 6);
+	set_debug(0UL, 7);
+
+	fpu_init(); 
+}
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
new file mode 100644
index 000000000000..5ace32c6ac5c
--- /dev/null
+++ b/arch/x86_64/kernel/signal.c
@@ -0,0 +1,486 @@
+/*
+ *  linux/arch/x86_64/kernel/signal.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ *
+ *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
+ *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
+ *  2000-2002   x86-64 support by Andi Kleen
+ * 
+ *  $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/ptrace.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/compiler.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include <asm/proto.h>
+
+/* #define DEBUG_SIG 1 */
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+               sigset_t *set, struct pt_regs * regs); 
+void ia32_setup_frame(int sig, struct k_sigaction *ka,
+            sigset_t *set, struct pt_regs * regs); 
+
+asmlinkage long
+sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs)
+{
+	sigset_t saveset, newset;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&newset, unewset, sizeof(newset)))
+		return -EFAULT;
+	sigdelsetmask(&newset, ~_BLOCKABLE);
+
+	spin_lock_irq(&current->sighand->siglock);
+	saveset = current->blocked;
+	current->blocked = newset;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+#ifdef DEBUG_SIG
+	printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n",
+		saveset, newset, regs, regs->rip);
+#endif 
+	regs->rax = -EINTR;
+	while (1) {
+		current->state = TASK_INTERRUPTIBLE;
+		schedule();
+		if (do_signal(regs, &saveset))
+			return -EINTR;
+	}
+}
+
+asmlinkage long
+sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
+		struct pt_regs *regs)
+{
+	return do_sigaltstack(uss, uoss, regs->rsp);
+}
+
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+struct rt_sigframe
+{
+	char *pretcode;
+	struct ucontext uc;
+	struct siginfo info;
+};
+
+static int
+restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
+{
+	unsigned int err = 0;
+
+	/* Always make any pending restarted system calls return -EINTR */
+	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+#define COPY(x)		err |= __get_user(regs->x, &sc->x)
+
+	COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
+	COPY(rdx); COPY(rcx); COPY(rip);
+	COPY(r8);
+	COPY(r9);
+	COPY(r10);
+	COPY(r11);
+	COPY(r12);
+	COPY(r13);
+	COPY(r14);
+	COPY(r15);
+
+	{
+		unsigned int tmpflags;
+		err |= __get_user(tmpflags, &sc->eflags);
+		regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
+		regs->orig_rax = -1;		/* disable syscall checks */
+	}
+
+	{
+		struct _fpstate __user * buf;
+		err |= __get_user(buf, &sc->fpstate);
+
+		if (buf) {
+			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
+				goto badframe;
+			err |= restore_i387(buf);
+		} else {
+			struct task_struct *me = current;
+			if (used_math()) {
+				clear_fpu(me);
+				clear_used_math();
+			}
+		}
+	}
+
+	err |= __get_user(*prax, &sc->rax);
+	return err;
+
+badframe:
+	return 1;
+}
+
+asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+{
+	struct rt_sigframe __user *frame;
+	sigset_t set;
+	unsigned long eax;
+
+	frame = (struct rt_sigframe __user *)(regs->rsp - 8);
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
+		goto badframe;
+	} 
+	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { 
+		goto badframe;
+	} 
+
+	sigdelsetmask(&set, ~_BLOCKABLE);
+	spin_lock_irq(&current->sighand->siglock);
+	current->blocked = set;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	
+	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+		goto badframe;
+
+#ifdef DEBUG_SIG
+	printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax);
+#endif
+
+	if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
+		goto badframe;
+
+	return eax;
+
+badframe:
+	signal_fault(regs,frame,"sigreturn");
+	return 0;
+}	
+
+/*
+ * Set up a signal frame.
+ */
+
+static inline int
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+{
+	int err = 0;
+	unsigned long eflags;
+
+	err |= __put_user(0, &sc->gs);
+	err |= __put_user(0, &sc->fs);
+
+	err |= __put_user(regs->rdi, &sc->rdi);
+	err |= __put_user(regs->rsi, &sc->rsi);
+	err |= __put_user(regs->rbp, &sc->rbp);
+	err |= __put_user(regs->rsp, &sc->rsp);
+	err |= __put_user(regs->rbx, &sc->rbx);
+	err |= __put_user(regs->rdx, &sc->rdx);
+	err |= __put_user(regs->rcx, &sc->rcx);
+	err |= __put_user(regs->rax, &sc->rax);
+	err |= __put_user(regs->r8, &sc->r8);
+	err |= __put_user(regs->r9, &sc->r9);
+	err |= __put_user(regs->r10, &sc->r10);
+	err |= __put_user(regs->r11, &sc->r11);
+	err |= __put_user(regs->r12, &sc->r12);
+	err |= __put_user(regs->r13, &sc->r13);
+	err |= __put_user(regs->r14, &sc->r14);
+	err |= __put_user(regs->r15, &sc->r15);
+	err |= __put_user(me->thread.trap_no, &sc->trapno);
+	err |= __put_user(me->thread.error_code, &sc->err);
+	err |= __put_user(regs->rip, &sc->rip);
+	eflags = regs->eflags;
+	if (current->ptrace & PT_PTRACED) {
+		eflags &= ~TF_MASK;
+	}
+	err |= __put_user(eflags, &sc->eflags);
+	err |= __put_user(mask, &sc->oldmask);
+	err |= __put_user(me->thread.cr2, &sc->cr2);
+
+	return err;
+}
+
+/*
+ * Determine which stack to use..
+ */
+
+static void __user *
+get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
+{
+	unsigned long rsp;
+
+	/* Default to using normal stack - redzone*/
+	rsp = regs->rsp - 128;
+
+	/* This is the X/Open sanctioned signal stack switching.  */
+	/* RED-PEN: redzone on that stack? */
+	if (ka->sa.sa_flags & SA_ONSTACK) {
+		if (sas_ss_flags(rsp) == 0)
+			rsp = current->sas_ss_sp + current->sas_ss_size;
+	}
+
+	return (void __user *)round_down(rsp - size, 16); 
+}
+
+static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+			   sigset_t *set, struct pt_regs * regs)
+{
+	struct rt_sigframe __user *frame;
+	struct _fpstate __user *fp = NULL; 
+	int err = 0;
+	struct task_struct *me = current;
+
+	if (used_math()) {
+		fp = get_stack(ka, regs, sizeof(struct _fpstate)); 
+		frame = (void __user *)round_down(
+			(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
+
+		if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
+			goto give_sigsegv;
+
+		if (save_i387(fp) < 0) 
+			err |= -1; 
+	} else
+		frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
+
+	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+		goto give_sigsegv;
+
+	if (ka->sa.sa_flags & SA_SIGINFO) { 
+		err |= copy_siginfo_to_user(&frame->info, info);
+		if (err)
+			goto give_sigsegv;
+	}
+		
+	/* Create the ucontext.  */
+	err |= __put_user(0, &frame->uc.uc_flags);
+	err |= __put_user(0, &frame->uc.uc_link);
+	err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+	err |= __put_user(sas_ss_flags(regs->rsp),
+			  &frame->uc.uc_stack.ss_flags);
+	err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
+	err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
+	err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
+	if (sizeof(*set) == 16) { 
+		__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
+		__put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 
+	} else
+		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+	/* Set up to return from userspace.  If provided, use a stub
+	   already in userspace.  */
+	/* x86-64 should always use SA_RESTORER. */
+	if (ka->sa.sa_flags & SA_RESTORER) {
+		err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
+	} else {
+		/* could use a vstub here */
+		goto give_sigsegv; 
+	}
+
+	if (err)
+		goto give_sigsegv;
+
+#ifdef DEBUG_SIG
+	printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
+#endif
+
+	/* Set up registers for signal handler */
+	{ 
+		struct exec_domain *ed = current_thread_info()->exec_domain;
+		if (unlikely(ed && ed->signal_invmap && sig < 32))
+			sig = ed->signal_invmap[sig];
+	} 
+	regs->rdi = sig;
+	/* In case the signal handler was declared without prototypes */ 
+	regs->rax = 0;	
+
+	/* This also works for non SA_SIGINFO handlers because they expect the
+	   next argument after the signal number on the stack. */
+	regs->rsi = (unsigned long)&frame->info; 
+	regs->rdx = (unsigned long)&frame->uc; 
+	regs->rip = (unsigned long) ka->sa.sa_handler;
+
+	regs->rsp = (unsigned long)frame;
+
+	set_fs(USER_DS);
+	if (regs->eflags & TF_MASK) {
+		if ((current->ptrace & (PT_PTRACED | PT_DTRACE)) == (PT_PTRACED | PT_DTRACE)) {
+			ptrace_notify(SIGTRAP);
+		} else {
+			regs->eflags &= ~TF_MASK;
+		}
+	}
+
+#ifdef DEBUG_SIG
+	printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+		current->comm, current->pid, frame, regs->rip, frame->pretcode);
+#endif
+
+	return;
+
+give_sigsegv:
+	force_sigsegv(sig, current);
+}
+
+/*
+ * OK, we're invoking a handler
+ */	
+
+static void
+handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
+		sigset_t *oldset, struct pt_regs *regs)
+{
+#ifdef DEBUG_SIG
+	printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
+		current->pid, sig,
+		regs->rip, regs->rsp, regs);
+#endif
+
+	/* Are we from a system call? */
+	if ((long)regs->orig_rax >= 0) {
+		/* If so, check system call restarting.. */
+		switch (regs->rax) {
+		        case -ERESTART_RESTARTBLOCK:
+			case -ERESTARTNOHAND:
+				regs->rax = -EINTR;
+				break;
+
+			case -ERESTARTSYS:
+				if (!(ka->sa.sa_flags & SA_RESTART)) {
+					regs->rax = -EINTR;
+					break;
+				}
+				/* fallthrough */
+			case -ERESTARTNOINTR:
+				regs->rax = regs->orig_rax;
+				regs->rip -= 2;
+				break;
+		}
+	}
+
+#ifdef CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32)) {
+		if (ka->sa.sa_flags & SA_SIGINFO)
+			ia32_setup_rt_frame(sig, ka, info, oldset, regs);
+		else
+			ia32_setup_frame(sig, ka, oldset, regs);
+	} else 
+#endif
+	setup_rt_frame(sig, ka, info, oldset, regs);
+
+	if (!(ka->sa.sa_flags & SA_NODEFER)) {
+		spin_lock_irq(&current->sighand->siglock);
+		sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
+		sigaddset(&current->blocked,sig);
+		recalc_sigpending();
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+int do_signal(struct pt_regs *regs, sigset_t *oldset)
+{
+	struct k_sigaction ka;
+	siginfo_t info;
+	int signr;
+
+	/*
+	 * We want the common case to go fast, which
+	 * is why we may in certain cases get here from
+	 * kernel mode. Just return without doing anything
+	 * if so.
+	 */
+	if ((regs->cs & 3) != 3)
+		return 1;
+
+	if (try_to_freeze(0))
+		goto no_signal;
+
+	if (!oldset)
+		oldset = &current->blocked;
+
+	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+	if (signr > 0) {
+		/* Reenable any watchpoints before delivering the
+		 * signal to user space. The processor register will
+		 * have been cleared if the watchpoint triggered
+		 * inside the kernel.
+		 */
+		if (current->thread.debugreg7)
+			asm volatile("movq %0,%%db7"	: : "r" (current->thread.debugreg7));
+
+		/* Whee!  Actually deliver the signal.  */
+		handle_signal(signr, &info, &ka, oldset, regs);
+		return 1;
+	}
+
+ no_signal:
+	/* Did we come from a system call? */
+	if ((long)regs->orig_rax >= 0) {
+		/* Restart the system call - no handlers present */
+		long res = regs->rax;
+		if (res == -ERESTARTNOHAND ||
+		    res == -ERESTARTSYS ||
+		    res == -ERESTARTNOINTR) {
+			regs->rax = regs->orig_rax;
+			regs->rip -= 2;
+		}
+		if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) {
+			regs->rax = __NR_restart_syscall;
+			regs->rip -= 2;
+		}
+	}
+	return 0;
+}
+
+void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags)
+{
+#ifdef DEBUG_SIG
+	printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n",
+	       thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); 
+#endif
+	       
+	/* Pending single-step? */
+	if (thread_info_flags & _TIF_SINGLESTEP) {
+		regs->eflags |= TF_MASK;
+		clear_thread_flag(TIF_SINGLESTEP);
+	}
+
+	/* deal with pending signal delivery */
+	if (thread_info_flags & _TIF_SIGPENDING)
+		do_signal(regs,oldset);
+}
+
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+{ 
+	struct task_struct *me = current; 
+	if (exception_trace)
+		printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
+	       me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); 
+
+	force_sig(SIGSEGV, me); 
+} 
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
new file mode 100644
index 000000000000..e3cdbf9a88bd
--- /dev/null
+++ b/arch/x86_64/kernel/smp.c
@@ -0,0 +1,415 @@
+/*
+ *	Intel SMP support routines.
+ *
+ *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *	(c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ *      (c) 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ *	This code is released under the GNU General Public License version 2 or
+ *	later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/smp_lock.h>
+#include <linux/smp.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/interrupt.h>
+
+#include <asm/mtrr.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mach_apic.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+
+/*
+ *	Smarter SMP flushing macros. 
+ *		c/o Linus Torvalds.
+ *
+ *	These mean you can really definitely utterly forget about
+ *	writing to user space from interrupts. (Its not allowed anyway).
+ *
+ *	Optimizations Manfred Spraul <manfred@colorfullife.com>
+ */
+
+static cpumask_t flush_cpumask;
+static struct mm_struct * flush_mm;
+static unsigned long flush_va;
+static DEFINE_SPINLOCK(tlbstate_lock);
+#define FLUSH_ALL	-1ULL
+
+/*
+ * We cannot call mmdrop() because we are in interrupt context, 
+ * instead update mm->cpu_vm_mask.
+ */
+static inline void leave_mm (unsigned long cpu)
+{
+	if (read_pda(mmu_state) == TLBSTATE_OK)
+		BUG();
+	clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask);
+	load_cr3(swapper_pg_dir);
+}
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
+ * 	Stop ipi delivery for the old mm. This is not synchronized with
+ * 	the other cpus, but smp_invalidate_interrupt ignore flush ipis
+ * 	for the wrong mm, and in the worst case we perform a superfluous
+ * 	tlb flush.
+ * 1a2) set cpu mmu_state to TLBSTATE_OK
+ * 	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *	was in lazy tlb mode.
+ * 1a3) update cpu active_mm
+ * 	Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
+ * 	Now the other cpus will send tlb flush ipis.
+ * 1a4) change cr3.
+ * 1b) thread switch without mm change
+ *	cpu active_mm is correct, cpu0 already handles
+ *	flush ipis.
+ * 1b1) set cpu mmu_state to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ * 	Atomically set the bit [other cpus will start sending flush ipis],
+ * 	and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu mmu_state is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ */
+
+asmlinkage void smp_invalidate_interrupt (void)
+{
+	unsigned long cpu;
+
+	cpu = get_cpu();
+
+	if (!cpu_isset(cpu, flush_cpumask))
+		goto out;
+		/* 
+		 * This was a BUG() but until someone can quote me the
+		 * line from the intel manual that guarantees an IPI to
+		 * multiple CPUs is retried _only_ on the erroring CPUs
+		 * its staying as a return
+		 *
+		 * BUG();
+		 */
+		 
+	if (flush_mm == read_pda(active_mm)) {
+		if (read_pda(mmu_state) == TLBSTATE_OK) {
+			if (flush_va == FLUSH_ALL)
+				local_flush_tlb();
+			else
+				__flush_tlb_one(flush_va);
+		} else
+			leave_mm(cpu);
+	}
+	ack_APIC_irq();
+	cpu_clear(cpu, flush_cpumask);
+
+out:
+	put_cpu_no_resched();
+}
+
+static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+						unsigned long va)
+{
+	cpumask_t tmp;
+	/*
+	 * A couple of (to be removed) sanity checks:
+	 *
+	 * - we do not send IPIs to not-yet booted CPUs.
+	 * - current CPU must not be in mask
+	 * - mask must exist :)
+	 */
+	BUG_ON(cpus_empty(cpumask));
+	cpus_and(tmp, cpumask, cpu_online_map);
+	BUG_ON(!cpus_equal(tmp, cpumask));
+	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+	if (!mm)
+		BUG();
+
+	/*
+	 * I'm not happy about this global shared spinlock in the
+	 * MM hot path, but we'll see how contended it is.
+	 * Temporarily this turns IRQs off, so that lockups are
+	 * detected by the NMI watchdog.
+	 */
+	spin_lock(&tlbstate_lock);
+	
+	flush_mm = mm;
+	flush_va = va;
+	cpus_or(flush_cpumask, cpumask, flush_cpumask);
+
+	/*
+	 * We have to send the IPI only to
+	 * CPUs affected.
+	 */
+	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+
+	while (!cpus_empty(flush_cpumask))
+		mb();	/* nothing. lockup detection does not belong here */;
+
+	flush_mm = NULL;
+	flush_va = 0;
+	spin_unlock(&tlbstate_lock);
+}
+	
+void flush_tlb_current_task(void)
+{
+	struct mm_struct *mm = current->mm;
+	cpumask_t cpu_mask;
+
+	preempt_disable();
+	cpu_mask = mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), cpu_mask);
+
+	local_flush_tlb();
+	if (!cpus_empty(cpu_mask))
+		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+	preempt_enable();
+}
+
+void flush_tlb_mm (struct mm_struct * mm)
+{
+	cpumask_t cpu_mask;
+
+	preempt_disable();
+	cpu_mask = mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), cpu_mask);
+
+	if (current->active_mm == mm) {
+		if (current->mm)
+			local_flush_tlb();
+		else
+			leave_mm(smp_processor_id());
+	}
+	if (!cpus_empty(cpu_mask))
+		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+
+	preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	cpumask_t cpu_mask;
+
+	preempt_disable();
+	cpu_mask = mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), cpu_mask);
+
+	if (current->active_mm == mm) {
+		if(current->mm)
+			__flush_tlb_one(va);
+		 else
+		 	leave_mm(smp_processor_id());
+	}
+
+	if (!cpus_empty(cpu_mask))
+		flush_tlb_others(cpu_mask, mm, va);
+
+	preempt_enable();
+}
+
+static void do_flush_tlb_all(void* info)
+{
+	unsigned long cpu = smp_processor_id();
+
+	__flush_tlb_all();
+	if (read_pda(mmu_state) == TLBSTATE_LAZY)
+		leave_mm(cpu);
+}
+
+void flush_tlb_all(void)
+{
+	on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+}
+
+void smp_kdb_stop(void)
+{
+	send_IPI_allbutself(KDB_VECTOR);
+}
+
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+
+void smp_send_reschedule(int cpu)
+{
+	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+	void (*func) (void *info);
+	void *info;
+	atomic_t started;
+	atomic_t finished;
+	int wait;
+};
+
+static struct call_data_struct * call_data;
+
+/*
+ * this function sends a 'generic call function' IPI to all other CPUs
+ * in the system.
+ */
+static void __smp_call_function (void (*func) (void *info), void *info,
+				int nonatomic, int wait)
+{
+	struct call_data_struct data;
+	int cpus = num_online_cpus()-1;
+
+	if (!cpus)
+		return;
+
+	data.func = func;
+	data.info = info;
+	atomic_set(&data.started, 0);
+	data.wait = wait;
+	if (wait)
+		atomic_set(&data.finished, 0);
+
+	call_data = &data;
+	wmb();
+	/* Send a message to all other CPUs and wait for them to respond */
+	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+	/* Wait for response */
+	while (atomic_read(&data.started) != cpus)
+		cpu_relax();
+
+	if (!wait)
+		return;
+
+	while (atomic_read(&data.finished) != cpus)
+		cpu_relax();
+}
+
+/*
+ * smp_call_function - run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: currently unused.
+ * @wait: If true, wait (atomically) until function has completed on other
+ *        CPUs.
+ *
+ * Returns 0 on success, else a negative status code. Does not return until
+ * remote CPUs are nearly ready to execute func or are or have executed.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ * Actually there are a few legal cases, like panic.
+ */
+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+			int wait)
+{
+	spin_lock(&call_lock);
+	__smp_call_function(func,info,nonatomic,wait);
+	spin_unlock(&call_lock);
+	return 0;
+}
+
+void smp_stop_cpu(void)
+{
+	/*
+	 * Remove this CPU:
+	 */
+	cpu_clear(smp_processor_id(), cpu_online_map);
+	local_irq_disable();
+	disable_local_APIC();
+	local_irq_enable(); 
+}
+
+static void smp_really_stop_cpu(void *dummy)
+{
+	smp_stop_cpu(); 
+	for (;;) 
+		asm("hlt"); 
+} 
+
+void smp_send_stop(void)
+{
+	int nolock = 0;
+	if (reboot_force)
+		return;
+	/* Don't deadlock on the call lock in panic */
+	if (!spin_trylock(&call_lock)) {
+		/* ignore locking because we have paniced anyways */
+		nolock = 1;
+	}
+	__smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
+	if (!nolock)
+		spin_unlock(&call_lock);
+
+	local_irq_disable();
+	disable_local_APIC();
+	local_irq_enable();
+}
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+asmlinkage void smp_reschedule_interrupt(void)
+{
+	ack_APIC_irq();
+}
+
+asmlinkage void smp_call_function_interrupt(void)
+{
+	void (*func) (void *info) = call_data->func;
+	void *info = call_data->info;
+	int wait = call_data->wait;
+
+	ack_APIC_irq();
+	/*
+	 * Notify initiating CPU that I've grabbed the data and am
+	 * about to execute the function
+	 */
+	mb();
+	atomic_inc(&call_data->started);
+	/*
+	 * At this point the info structure may be out of scope unless wait==1
+	 */
+	irq_enter();
+	(*func)(info);
+	irq_exit();
+	if (wait) {
+		mb();
+		atomic_inc(&call_data->finished);
+	}
+}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
new file mode 100644
index 000000000000..afd3600b919e
--- /dev/null
+++ b/arch/x86_64/kernel/smpboot.c
@@ -0,0 +1,938 @@
+/*
+ *	x86 SMP booting functions
+ *
+ *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *	(c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *	Copyright 2001 Andi Kleen, SuSE Labs.
+ *
+ *	Much of the core SMP work is based on previous work by Thomas Radke, to
+ *	whom a great many thanks are extended.
+ *
+ *	Thanks to Intel for making available several different Pentium,
+ *	Pentium Pro and Pentium-II/Xeon MP machines.
+ *	Original development of Linux SMP code supported by Caldera.
+ *
+ *	This code is released under the GNU General Public License version 2 or
+ *	later.
+ *
+ *	Fixes
+ *		Felix Koop	:	NR_CPUS used properly
+ *		Jose Renau	:	Handle single CPU case.
+ *		Alan Cox	:	By repeated request 8) - Total BogoMIP report.
+ *		Greg Wright	:	Fix for kernel stacks panic.
+ *		Erich Boleyn	:	MP v1.4 and additional changes.
+ *	Matthias Sattler	:	Changes for 2.1 kernel map.
+ *	Michel Lespinasse	:	Changes for 2.1 kernel map.
+ *	Michael Chastain	:	Change trampoline.S to gnu as.
+ *		Alan Cox	:	Dumb bug: 'B' step PPro's are fine
+ *		Ingo Molnar	:	Added APIC timers, based on code
+ *					from Jose Renau
+ *		Ingo Molnar	:	various cleanups and rewrites
+ *		Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
+ *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs
+ *	Andi Kleen		:	Changed for SMP boot into long mode.
+ *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process. 
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/smp_lock.h>
+#include <linux/irq.h>
+#include <linux/bootmem.h>
+#include <linux/thread_info.h>
+#include <linux/module.h>
+
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h>
+#include <asm/mtrr.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#include <asm/kdebug.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+/* Package ID of each logical CPU */
+u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+EXPORT_SYMBOL(phys_proc_id);
+
+/* Bitmask of currently online CPUs */
+cpumask_t cpu_online_map;
+
+cpumask_t cpu_callin_map;
+cpumask_t cpu_callout_map;
+static cpumask_t smp_commenced_mask;
+
+/* Per CPU bogomips and other parameters */
+struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+
+cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+
+/*
+ * Trampoline 80x86 program as an array.
+ */
+
+extern unsigned char trampoline_data [];
+extern unsigned char trampoline_end  [];
+
+/*
+ * Currently trivial. Write the real->protected mode
+ * bootstrap into the page concerned. The caller
+ * has made sure it's suitably aligned.
+ */
+
+static unsigned long __init setup_trampoline(void)
+{
+	void *tramp = __va(SMP_TRAMPOLINE_BASE); 
+	memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
+	return virt_to_phys(tramp);
+}
+
+/*
+ * The bootstrap kernel entry code has set these up. Save them for
+ * a given CPU
+ */
+
+static void __init smp_store_cpu_info(int id)
+{
+	struct cpuinfo_x86 *c = cpu_data + id;
+
+	*c = boot_cpu_data;
+	identify_cpu(c);
+}
+
+/*
+ * TSC synchronization.
+ *
+ * We first check whether all CPUs have their TSC's synchronized,
+ * then we print a warning if not, and always resync.
+ */
+
+static atomic_t tsc_start_flag = ATOMIC_INIT(0);
+static atomic_t tsc_count_start = ATOMIC_INIT(0);
+static atomic_t tsc_count_stop = ATOMIC_INIT(0);
+static unsigned long long tsc_values[NR_CPUS];
+
+#define NR_LOOPS 5
+
+extern unsigned int fast_gettimeoffset_quotient;
+
+static void __init synchronize_tsc_bp (void)
+{
+	int i;
+	unsigned long long t0;
+	unsigned long long sum, avg;
+	long long delta;
+	long one_usec;
+	int buggy = 0;
+
+	printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",num_booting_cpus());
+
+	one_usec = cpu_khz; 
+
+	atomic_set(&tsc_start_flag, 1);
+	wmb();
+
+	/*
+	 * We loop a few times to get a primed instruction cache,
+	 * then the last pass is more or less synchronized and
+	 * the BP and APs set their cycle counters to zero all at
+	 * once. This reduces the chance of having random offsets
+	 * between the processors, and guarantees that the maximum
+	 * delay between the cycle counters is never bigger than
+	 * the latency of information-passing (cachelines) between
+	 * two CPUs.
+	 */
+	for (i = 0; i < NR_LOOPS; i++) {
+		/*
+		 * all APs synchronize but they loop on '== num_cpus'
+		 */
+		while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb();
+		atomic_set(&tsc_count_stop, 0);
+		wmb();
+		/*
+		 * this lets the APs save their current TSC:
+		 */
+		atomic_inc(&tsc_count_start);
+
+		sync_core();
+		rdtscll(tsc_values[smp_processor_id()]);
+		/*
+		 * We clear the TSC in the last loop:
+		 */
+		if (i == NR_LOOPS-1)
+			write_tsc(0, 0);
+
+		/*
+		 * Wait for all APs to leave the synchronization point:
+		 */
+		while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb();
+		atomic_set(&tsc_count_start, 0);
+		wmb();
+		atomic_inc(&tsc_count_stop);
+	}
+
+	sum = 0;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (cpu_isset(i, cpu_callout_map)) {
+		t0 = tsc_values[i];
+		sum += t0;
+	}
+	}
+	avg = sum / num_booting_cpus();
+
+	sum = 0;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_isset(i, cpu_callout_map))
+			continue;
+
+		delta = tsc_values[i] - avg;
+		if (delta < 0)
+			delta = -delta;
+		/*
+		 * We report bigger than 2 microseconds clock differences.
+		 */
+		if (delta > 2*one_usec) {
+			long realdelta;
+			if (!buggy) {
+				buggy = 1;
+				printk("\n");
+			}
+			realdelta = delta / one_usec;
+			if (tsc_values[i] < avg)
+				realdelta = -realdelta;
+
+			printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
+				i, realdelta);
+		}
+
+		sum += delta;
+	}
+	if (!buggy)
+		printk("passed.\n");
+}
+
+static void __init synchronize_tsc_ap (void)
+{
+	int i;
+
+	/*
+	 * Not every cpu is online at the time
+	 * this gets called, so we first wait for the BP to
+	 * finish SMP initialization:
+	 */
+	while (!atomic_read(&tsc_start_flag)) mb();
+
+	for (i = 0; i < NR_LOOPS; i++) {
+		atomic_inc(&tsc_count_start);
+		while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb();
+
+		sync_core();
+		rdtscll(tsc_values[smp_processor_id()]);
+		if (i == NR_LOOPS-1)
+			write_tsc(0, 0);
+
+		atomic_inc(&tsc_count_stop);
+		while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+	}
+}
+#undef NR_LOOPS
+
+static atomic_t init_deasserted;
+
+static void __init smp_callin(void)
+{
+	int cpuid, phys_id;
+	unsigned long timeout;
+
+	/*
+	 * If waken up by an INIT in an 82489DX configuration
+	 * we may get here before an INIT-deassert IPI reaches
+	 * our local APIC.  We have to wait for the IPI or we'll
+	 * lock up on an APIC access.
+	 */
+	while (!atomic_read(&init_deasserted));
+
+	/*
+	 * (This works even if the APIC is not enabled.)
+	 */
+	phys_id = GET_APIC_ID(apic_read(APIC_ID));
+	cpuid = smp_processor_id();
+	if (cpu_isset(cpuid, cpu_callin_map)) {
+		panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
+					phys_id, cpuid);
+	}
+	Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
+
+	/*
+	 * STARTUP IPIs are fragile beasts as they might sometimes
+	 * trigger some glue motherboard logic. Complete APIC bus
+	 * silence for 1 second, this overestimates the time the
+	 * boot CPU is spending to send the up to 2 STARTUP IPIs
+	 * by a factor of two. This should be enough.
+	 */
+
+	/*
+	 * Waiting 2s total for startup (udelay is not yet working)
+	 */
+	timeout = jiffies + 2*HZ;
+	while (time_before(jiffies, timeout)) {
+		/*
+		 * Has the boot CPU finished it's STARTUP sequence?
+		 */
+		if (cpu_isset(cpuid, cpu_callout_map))
+			break;
+		rep_nop();
+	}
+
+	if (!time_before(jiffies, timeout)) {
+		panic("smp_callin: CPU%d started up but did not get a callout!\n",
+			cpuid);
+	}
+
+	/*
+	 * the boot CPU has finished the init stage and is spinning
+	 * on callin_map until we finish. We are free to set up this
+	 * CPU, first the APIC. (this is probably redundant on most
+	 * boards)
+	 */
+
+	Dprintk("CALLIN, before setup_local_APIC().\n");
+	setup_local_APIC();
+
+	local_irq_enable();
+
+	/*
+	 * Get our bogomips.
+	 */
+	calibrate_delay();
+	Dprintk("Stack at about %p\n",&cpuid);
+
+	disable_APIC_timer();
+
+	/*
+	 * Save our processor parameters
+	 */
+ 	smp_store_cpu_info(cpuid);
+
+	local_irq_disable();
+
+	/*
+	 * Allow the master to continue.
+	 */
+	cpu_set(cpuid, cpu_callin_map);
+
+	/*
+	 *      Synchronize the TSC with the BP
+	 */
+	if (cpu_has_tsc)
+		synchronize_tsc_ap();
+}
+
+static int cpucount;
+
+/*
+ * Activate a secondary processor.
+ */
+void __init start_secondary(void)
+{
+	/*
+	 * Dont put anything before smp_callin(), SMP
+	 * booting is too fragile that we want to limit the
+	 * things done here to the most necessary things.
+	 */
+	cpu_init();
+	smp_callin();
+
+	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
+	barrier();
+
+	Dprintk("cpu %d: waiting for commence\n", smp_processor_id()); 
+	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
+		rep_nop();
+
+	Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); 	
+	setup_secondary_APIC_clock();
+
+	Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); 
+
+	if (nmi_watchdog == NMI_IO_APIC) {
+		disable_8259A_irq(0);
+		enable_NMI_through_LVT0(NULL);
+		enable_8259A_irq(0);
+	}
+
+
+	enable_APIC_timer(); 
+
+	/*
+	 * low-memory mappings have been cleared, flush them from
+	 * the local TLBs too.
+	 */
+	local_flush_tlb();
+
+	Dprintk("cpu %d eSetting cpu_online_map\n", smp_processor_id()); 
+	cpu_set(smp_processor_id(), cpu_online_map);
+	wmb();
+	
+	cpu_idle();
+}
+
+extern volatile unsigned long init_rsp; 
+extern void (*initial_code)(void);
+
+#if APIC_DEBUG
+static inline void inquire_remote_apic(int apicid)
+{
+	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
+	char *names[] = { "ID", "VERSION", "SPIV" };
+	int timeout, status;
+
+	printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
+
+	for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
+		printk("... APIC #%d %s: ", apicid, names[i]);
+
+		/*
+		 * Wait for idle.
+		 */
+		apic_wait_icr_idle();
+
+		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
+		apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
+
+		timeout = 0;
+		do {
+			udelay(100);
+			status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
+		} while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
+
+		switch (status) {
+		case APIC_ICR_RR_VALID:
+			status = apic_read(APIC_RRR);
+			printk("%08x\n", status);
+			break;
+		default:
+			printk("failed\n");
+		}
+	}
+}
+#endif
+
+static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
+{
+	unsigned long send_status = 0, accept_status = 0;
+	int maxlvt, timeout, num_starts, j;
+
+	Dprintk("Asserting INIT.\n");
+
+	/*
+	 * Turn INIT on target chip
+	 */
+	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+	/*
+	 * Send IPI
+	 */
+	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
+				| APIC_DM_INIT);
+
+	Dprintk("Waiting for send to finish...\n");
+	timeout = 0;
+	do {
+		Dprintk("+");
+		udelay(100);
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+	} while (send_status && (timeout++ < 1000));
+
+	mdelay(10);
+
+	Dprintk("Deasserting INIT.\n");
+
+	/* Target chip */
+	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+	/* Send IPI */
+	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+
+	Dprintk("Waiting for send to finish...\n");
+	timeout = 0;
+	do {
+		Dprintk("+");
+		udelay(100);
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+	} while (send_status && (timeout++ < 1000));
+
+	atomic_set(&init_deasserted, 1);
+
+	/*
+	 * Should we send STARTUP IPIs ?
+	 *
+	 * Determine this based on the APIC version.
+	 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
+	 */
+	if (APIC_INTEGRATED(apic_version[phys_apicid]))
+		num_starts = 2;
+	else
+		num_starts = 0;
+
+	/*
+	 * Run STARTUP IPI loop.
+	 */
+	Dprintk("#startup loops: %d.\n", num_starts);
+
+	maxlvt = get_maxlvt();
+
+	for (j = 1; j <= num_starts; j++) {
+		Dprintk("Sending STARTUP #%d.\n",j);
+		apic_read_around(APIC_SPIV);
+		apic_write(APIC_ESR, 0);
+		apic_read(APIC_ESR);
+		Dprintk("After apic_write.\n");
+
+		/*
+		 * STARTUP IPI
+		 */
+
+		/* Target chip */
+		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+		/* Boot on the stack */
+		/* Kick the second */
+		apic_write_around(APIC_ICR, APIC_DM_STARTUP
+					| (start_rip >> 12));
+
+		/*
+		 * Give the other CPU some time to accept the IPI.
+		 */
+		udelay(300);
+
+		Dprintk("Startup point 1.\n");
+
+		Dprintk("Waiting for send to finish...\n");
+		timeout = 0;
+		do {
+			Dprintk("+");
+			udelay(100);
+			send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+		} while (send_status && (timeout++ < 1000));
+
+		/*
+		 * Give the other CPU some time to accept the IPI.
+		 */
+		udelay(200);
+		/*
+		 * Due to the Pentium erratum 3AP.
+		 */
+		if (maxlvt > 3) {
+			apic_read_around(APIC_SPIV);
+			apic_write(APIC_ESR, 0);
+		}
+		accept_status = (apic_read(APIC_ESR) & 0xEF);
+		if (send_status || accept_status)
+			break;
+	}
+	Dprintk("After Startup.\n");
+
+	if (send_status)
+		printk(KERN_ERR "APIC never delivered???\n");
+	if (accept_status)
+		printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
+
+	return (send_status | accept_status);
+}
+
+static void __init do_boot_cpu (int apicid)
+{
+	struct task_struct *idle;
+	unsigned long boot_error;
+	int timeout, cpu;
+	unsigned long start_rip;
+
+	cpu = ++cpucount;
+	/*
+	 * We can't use kernel_thread since we must avoid to
+	 * reschedule the child.
+	 */
+	idle = fork_idle(cpu);
+	if (IS_ERR(idle))
+		panic("failed fork for CPU %d", cpu);
+	x86_cpu_to_apicid[cpu] = apicid;
+
+	cpu_pda[cpu].pcurrent = idle;
+
+	start_rip = setup_trampoline();
+
+	init_rsp = idle->thread.rsp; 
+	per_cpu(init_tss,cpu).rsp0 = init_rsp;
+	initial_code = start_secondary;
+	clear_ti_thread_flag(idle->thread_info, TIF_FORK);
+
+	printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, 
+	       start_rip, init_rsp);
+
+	/*
+	 * This grunge runs the startup process for
+	 * the targeted processor.
+	 */
+
+	atomic_set(&init_deasserted, 0);
+
+	Dprintk("Setting warm reset code and vector.\n");
+
+	CMOS_WRITE(0xa, 0xf);
+	local_flush_tlb();
+	Dprintk("1.\n");
+	*((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
+	Dprintk("2.\n");
+	*((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
+	Dprintk("3.\n");
+
+	/*
+	 * Be paranoid about clearing APIC errors.
+	 */
+	if (APIC_INTEGRATED(apic_version[apicid])) {
+		apic_read_around(APIC_SPIV);
+		apic_write(APIC_ESR, 0);
+		apic_read(APIC_ESR);
+	}
+
+	/*
+	 * Status is now clean
+	 */
+	boot_error = 0;
+
+	/*
+	 * Starting actual IPI sequence...
+	 */
+	boot_error = wakeup_secondary_via_INIT(apicid, start_rip); 
+
+	if (!boot_error) {
+		/*
+		 * allow APs to start initializing.
+		 */
+		Dprintk("Before Callout %d.\n", cpu);
+		cpu_set(cpu, cpu_callout_map);
+		Dprintk("After Callout %d.\n", cpu);
+
+		/*
+		 * Wait 5s total for a response
+		 */
+		for (timeout = 0; timeout < 50000; timeout++) {
+			if (cpu_isset(cpu, cpu_callin_map))
+				break;	/* It has booted */
+			udelay(100);
+		}
+
+		if (cpu_isset(cpu, cpu_callin_map)) {
+			/* number CPUs logically, starting from 1 (BSP is 0) */
+			Dprintk("OK.\n");
+			print_cpu_info(&cpu_data[cpu]);
+			Dprintk("CPU has booted.\n");
+		} else {
+			boot_error = 1;
+			if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
+					== 0xA5)
+				/* trampoline started but...? */
+				printk("Stuck ??\n");
+			else
+				/* trampoline code not run */
+				printk("Not responding.\n");
+#if APIC_DEBUG
+			inquire_remote_apic(apicid);
+#endif
+		}
+	}
+	if (boot_error) {
+		cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+		clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
+		cpucount--;
+		x86_cpu_to_apicid[cpu] = BAD_APICID;
+		x86_cpu_to_log_apicid[cpu] = BAD_APICID;
+	}
+}
+
+static void smp_tune_scheduling (void)
+{
+	int cachesize;       /* kB   */
+	unsigned long bandwidth = 1000; /* MB/s */
+	/*
+	 * Rough estimation for SMP scheduling, this is the number of
+	 * cycles it takes for a fully memory-limited process to flush
+	 * the SMP-local cache.
+	 *
+	 * (For a P5 this pretty much means we will choose another idle
+	 *  CPU almost always at wakeup time (this is due to the small
+	 *  L1 cache), on PIIs it's around 50-100 usecs, depending on
+	 *  the cache size)
+	 */
+
+	if (!cpu_khz) {
+		return;
+	} else {
+		cachesize = boot_cpu_data.x86_cache_size;
+		if (cachesize == -1) {
+			cachesize = 16; /* Pentiums, 2x8kB cache */
+			bandwidth = 100;
+		}
+	}
+}
+
+/*
+ * Cycle through the processors sending APIC IPIs to boot each.
+ */
+
+static void __init smp_boot_cpus(unsigned int max_cpus)
+{
+	unsigned apicid, cpu, bit, kicked;
+
+	nmi_watchdog_default();
+
+	/*
+	 * Setup boot CPU information
+	 */
+	smp_store_cpu_info(0); /* Final full version of the data */
+	printk(KERN_INFO "CPU%d: ", 0);
+	print_cpu_info(&cpu_data[0]);
+
+	current_thread_info()->cpu = 0;
+	smp_tune_scheduling();
+
+	if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
+		printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
+		       hard_smp_processor_id());
+		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+	}
+
+	/*
+	 * If we couldn't find an SMP configuration at boot time,
+	 * get out of here now!
+	 */
+	if (!smp_found_config) {
+		printk(KERN_NOTICE "SMP motherboard not detected.\n");
+		io_apic_irqs = 0;
+		cpu_online_map = cpumask_of_cpu(0);
+		cpu_set(0, cpu_sibling_map[0]);
+		phys_cpu_present_map = physid_mask_of_physid(0);
+		if (APIC_init_uniprocessor())
+			printk(KERN_NOTICE "Local APIC not detected."
+					   " Using dummy APIC emulation.\n");
+		goto smp_done;
+	}
+
+	/*
+	 * Should not be necessary because the MP table should list the boot
+	 * CPU too, but we do it for the sake of robustness anyway.
+	 */
+	if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
+		printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
+								 boot_cpu_id);
+		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+	}
+
+	/*
+	 * If we couldn't find a local APIC, then get out of here now!
+	 */
+	if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
+		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+			boot_cpu_id);
+		printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
+		io_apic_irqs = 0;
+		cpu_online_map = cpumask_of_cpu(0);
+		cpu_set(0, cpu_sibling_map[0]);
+		phys_cpu_present_map = physid_mask_of_physid(0);
+		disable_apic = 1;
+		goto smp_done;
+	}
+
+	verify_local_APIC();
+
+	/*
+	 * If SMP should be disabled, then really disable it!
+	 */
+	if (!max_cpus) {
+		smp_found_config = 0;
+		printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
+		io_apic_irqs = 0;
+		cpu_online_map = cpumask_of_cpu(0);
+		cpu_set(0, cpu_sibling_map[0]);
+		phys_cpu_present_map = physid_mask_of_physid(0);
+		disable_apic = 1;
+		goto smp_done;
+	}
+
+	connect_bsp_APIC();
+	setup_local_APIC();
+
+	if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id)
+		BUG();
+
+	x86_cpu_to_apicid[0] = boot_cpu_id;
+
+	/*
+	 * Now scan the CPU present map and fire up the other CPUs.
+	 */
+	Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
+
+	kicked = 1;
+	for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
+		apicid = cpu_present_to_apicid(bit);
+		/*
+		 * Don't even attempt to start the boot CPU!
+		 */
+		if (apicid == boot_cpu_id || (apicid == BAD_APICID))
+			continue;
+
+		if (!physid_isset(apicid, phys_cpu_present_map))
+			continue;
+		if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
+			continue;
+
+		do_boot_cpu(apicid);
+		++kicked;
+	}
+
+	/*
+	 * Cleanup possible dangling ends...
+	 */
+	{
+		/*
+		 * Install writable page 0 entry to set BIOS data area.
+		 */
+		local_flush_tlb();
+
+		/*
+		 * Paranoid:  Set warm reset code and vector here back
+		 * to default values.
+		 */
+		CMOS_WRITE(0, 0xf);
+
+		*((volatile int *) phys_to_virt(0x467)) = 0;
+	}
+
+	/*
+	 * Allow the user to impress friends.
+	 */
+
+	Dprintk("Before bogomips.\n");
+	if (!cpucount) {
+		printk(KERN_INFO "Only one processor found.\n");
+	} else {
+		unsigned long bogosum = 0;
+		for (cpu = 0; cpu < NR_CPUS; cpu++)
+			if (cpu_isset(cpu, cpu_callout_map))
+				bogosum += cpu_data[cpu].loops_per_jiffy;
+		printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+			cpucount+1,
+			bogosum/(500000/HZ),
+			(bogosum/(5000/HZ))%100);
+		Dprintk("Before bogocount - setting activated=1.\n");
+	}
+
+	/*
+	 * Construct cpu_sibling_map[], so that we can tell the
+	 * sibling CPU efficiently.
+	 */
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		cpus_clear(cpu_sibling_map[cpu]);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		int siblings = 0;
+		int i;
+		if (!cpu_isset(cpu, cpu_callout_map))
+			continue;
+
+		if (smp_num_siblings > 1) {
+			for (i = 0; i < NR_CPUS; i++) {
+				if (!cpu_isset(i, cpu_callout_map))
+					continue;
+				if (phys_proc_id[cpu] == phys_proc_id[i]) {
+					siblings++;
+					cpu_set(i, cpu_sibling_map[cpu]);
+				}
+			}
+		} else { 
+			siblings++;
+			cpu_set(cpu, cpu_sibling_map[cpu]);
+		}
+
+		if (siblings != smp_num_siblings) {
+			printk(KERN_WARNING 
+	       "WARNING: %d siblings found for CPU%d, should be %d\n", 
+			       siblings, cpu, smp_num_siblings);
+			smp_num_siblings = siblings;
+		}       
+	}
+
+	Dprintk("Boot done.\n");
+
+	/*
+	 * Here we can be sure that there is an IO-APIC in the system. Let's
+	 * go and set it up:
+	 */
+	if (!skip_ioapic_setup && nr_ioapics)
+		setup_IO_APIC();
+	else
+		nr_ioapics = 0;
+
+	setup_boot_APIC_clock();
+
+	/*
+	 * Synchronize the TSC with the AP
+	 */
+	if (cpu_has_tsc && cpucount)
+		synchronize_tsc_bp();
+
+ smp_done:
+	time_init_smp();
+}
+
+/* These are wrappers to interface to the new boot process.  Someone
+   who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
+void __init smp_prepare_cpus(unsigned int max_cpus)
+{
+	smp_boot_cpus(max_cpus);
+}
+
+void __devinit smp_prepare_boot_cpu(void)
+{
+	cpu_set(smp_processor_id(), cpu_online_map);
+	cpu_set(smp_processor_id(), cpu_callout_map);
+}
+
+int __devinit __cpu_up(unsigned int cpu)
+{
+	/* This only works at boot for x86.  See "rewrite" above. */
+	if (cpu_isset(cpu, smp_commenced_mask)) {
+		local_irq_enable();
+		return -ENOSYS;
+	}
+
+	/* In case one didn't come up */
+	if (!cpu_isset(cpu, cpu_callin_map)) {
+		local_irq_enable();
+		return -EIO;
+	}
+	local_irq_enable();
+
+	/* Unleash the CPU! */
+	Dprintk("waiting for cpu %d\n", cpu);
+
+	cpu_set(cpu, smp_commenced_mask);
+	while (!cpu_isset(cpu, cpu_online_map))
+		mb();
+	return 0;
+}
+
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_IO_APIC
+	setup_ioapic_dest();
+#endif
+	zap_low_mappings();
+}
+
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
new file mode 100644
index 000000000000..ebaa1e37d657
--- /dev/null
+++ b/arch/x86_64/kernel/suspend.c
@@ -0,0 +1,157 @@
+/*
+ * Suspend support specific for i386.
+ *
+ * Distribute under GPLv2
+ *
+ * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <linux/sysrq.h>
+#include <linux/proc_fs.h>
+#include <linux/irq.h>
+#include <linux/pm.h>
+#include <linux/device.h>
+#include <linux/suspend.h>
+#include <asm/uaccess.h>
+#include <asm/acpi.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+
+struct saved_context saved_context;
+
+unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
+unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
+unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
+unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
+unsigned long saved_context_eflags;
+
+void __save_processor_state(struct saved_context *ctxt)
+{
+	kernel_fpu_begin();
+
+	/*
+	 * descriptor tables
+	 */
+	asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
+	asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
+	asm volatile ("sldt %0" : "=m" (ctxt->ldt));
+	asm volatile ("str %0"  : "=m" (ctxt->tr));
+
+	/* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
+	/* EFER should be constant for kernel version, no need to handle it. */
+	/*
+	 * segment registers
+	 */
+	asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
+	asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
+	asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
+	asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
+	asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
+
+	rdmsrl(MSR_FS_BASE, ctxt->fs_base);
+	rdmsrl(MSR_GS_BASE, ctxt->gs_base);
+	rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+
+	/*
+	 * control registers 
+	 */
+	asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0));
+	asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
+	asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
+	asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
+}
+
+void save_processor_state(void)
+{
+	__save_processor_state(&saved_context);
+}
+
+static void
+do_fpu_end(void)
+{
+        /* restore FPU regs if necessary */
+	/* Do it out of line so that gcc does not move cr0 load to some stupid place */
+        kernel_fpu_end();
+	mxcsr_feature_mask_init();
+}
+
+void __restore_processor_state(struct saved_context *ctxt)
+{
+	/*
+	 * control registers
+	 */
+	asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
+	asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
+	asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
+	asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
+
+	/*
+	 * segment registers
+	 */
+	asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
+	asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
+	asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
+	load_gs_index(ctxt->gs);
+	asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
+
+	wrmsrl(MSR_FS_BASE, ctxt->fs_base);
+	wrmsrl(MSR_GS_BASE, ctxt->gs_base);
+	wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+
+	/*
+	 * now restore the descriptor tables to their proper values
+	 * ltr is done i fix_processor_context().
+	 */
+	asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
+	asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
+	asm volatile ("lldt %0" :: "m" (ctxt->ldt));
+
+	fix_processor_context();
+
+	do_fpu_end();
+}
+
+void restore_processor_state(void)
+{
+	__restore_processor_state(&saved_context);
+}
+
+void fix_processor_context(void)
+{
+	int cpu = smp_processor_id();
+	struct tss_struct *t = &per_cpu(init_tss, cpu);
+
+	set_tss_desc(cpu,t);	/* This just modifies memory; should not be neccessary. But... This is neccessary, because 386 hardware has concept of busy TSS or some similar stupidity. */
+
+	cpu_gdt_table[cpu][GDT_ENTRY_TSS].type = 9;
+
+	syscall_init();                         /* This sets MSR_*STAR and related */
+	load_TR_desc();				/* This does ltr */
+	load_LDT(&current->active_mm->context);	/* This does lldt */
+
+	/*
+	 * Now maybe reload the debug registers
+	 */
+	if (current->thread.debugreg7){
+                loaddebug(&current->thread, 0);
+                loaddebug(&current->thread, 1);
+                loaddebug(&current->thread, 2);
+                loaddebug(&current->thread, 3);
+                /* no 4 and 5 */
+                loaddebug(&current->thread, 6);
+                loaddebug(&current->thread, 7);
+	}
+
+}
+
+
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
new file mode 100644
index 000000000000..53f8e1659511
--- /dev/null
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -0,0 +1,104 @@
+/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * Distribute under GPLv2.
+ *
+ * swsusp_arch_resume may not use any stack, nor any variable that is
+ * not "NoSave" during copying pages:
+ *
+ * Its rewriting one kernel image with another. What is stack in "old"
+ * image could very well be data page in "new" image, and overwriting
+ * your own stack under you is bad idea.
+ */
+	
+	.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/offset.h>
+
+ENTRY(swsusp_arch_suspend)
+
+	movq %rsp, saved_context_esp(%rip)
+	movq %rax, saved_context_eax(%rip)
+	movq %rbx, saved_context_ebx(%rip)
+	movq %rcx, saved_context_ecx(%rip)
+	movq %rdx, saved_context_edx(%rip)
+	movq %rbp, saved_context_ebp(%rip)
+	movq %rsi, saved_context_esi(%rip)
+	movq %rdi, saved_context_edi(%rip)
+	movq %r8,  saved_context_r08(%rip)
+	movq %r9,  saved_context_r09(%rip)
+	movq %r10, saved_context_r10(%rip)
+	movq %r11, saved_context_r11(%rip)
+	movq %r12, saved_context_r12(%rip)
+	movq %r13, saved_context_r13(%rip)
+	movq %r14, saved_context_r14(%rip)
+	movq %r15, saved_context_r15(%rip)
+	pushfq ; popq saved_context_eflags(%rip)
+
+	call swsusp_save
+	ret
+
+ENTRY(swsusp_arch_resume)
+	/* set up cr3 */	
+	leaq	init_level4_pgt(%rip),%rax
+	subq	$__START_KERNEL_map,%rax
+	movq	%rax,%cr3
+
+	movq	mmu_cr4_features(%rip), %rax
+	movq	%rax, %rdx
+	andq	$~(1<<7), %rdx	# PGE
+	movq	%rdx, %cr4;  # turn off PGE
+	movq	%cr3, %rcx;  # flush TLB
+	movq	%rcx, %cr3;
+	movq	%rax, %cr4;  # turn PGE back on
+
+	movq	pagedir_nosave(%rip), %rdx
+loop:
+	testq	%rdx, %rdx
+	jz	done
+
+	/* get addresses from the pbe and copy the page */
+	movq	pbe_address(%rdx), %rsi
+	movq	pbe_orig_address(%rdx), %rdi
+	movq	$512, %rcx
+	rep
+	movsq
+
+	/* progress to the next pbe */
+	movq	pbe_next(%rdx), %rdx
+	jmp	loop
+done:
+	/* Flush TLB, including "global" things (vmalloc) */
+	movq	mmu_cr4_features(%rip), %rax
+	movq	%rax, %rdx
+	andq	$~(1<<7), %rdx;  # PGE
+	movq	%rdx, %cr4;  # turn off PGE
+	movq	%cr3, %rcx;  # flush TLB
+	movq	%rcx, %cr3
+	movq	%rax, %cr4;  # turn PGE back on
+
+	movl	$24, %eax
+	movl	%eax, %ds
+
+	movq saved_context_esp(%rip), %rsp
+	movq saved_context_ebp(%rip), %rbp
+	/* Don't restore %rax, it must be 0 anyway */
+	movq saved_context_ebx(%rip), %rbx
+	movq saved_context_ecx(%rip), %rcx
+	movq saved_context_edx(%rip), %rdx
+	movq saved_context_esi(%rip), %rsi
+	movq saved_context_edi(%rip), %rdi
+	movq saved_context_r08(%rip), %r8
+	movq saved_context_r09(%rip), %r9
+	movq saved_context_r10(%rip), %r10
+	movq saved_context_r11(%rip), %r11
+	movq saved_context_r12(%rip), %r12
+	movq saved_context_r13(%rip), %r13
+	movq saved_context_r14(%rip), %r14
+	movq saved_context_r15(%rip), %r15
+	pushq saved_context_eflags(%rip) ; popfq
+
+	xorq	%rax, %rax
+
+	ret
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
new file mode 100644
index 000000000000..477d8be57d64
--- /dev/null
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -0,0 +1,173 @@
+/*
+ * linux/arch/x86_64/kernel/sys_x86_64.c
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/personality.h>
+
+#include <asm/uaccess.h>
+#include <asm/ia32.h>
+
+/*
+ * sys_pipe() is the normal C calling standard for creating
+ * a pipe. It's not the way Unix traditionally does this, though.
+ */
+asmlinkage long sys_pipe(int __user *fildes)
+{
+	int fd[2];
+	int error;
+
+	error = do_pipe(fd);
+	if (!error) {
+		if (copy_to_user(fildes, fd, 2*sizeof(int)))
+			error = -EFAULT;
+	}
+	return error;
+}
+
+asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
+	unsigned long fd, unsigned long off)
+{
+	long error;
+	struct file * file;
+
+	error = -EINVAL;
+	if (off & ~PAGE_MASK)
+		goto out;
+
+	error = -EBADF;
+	file = NULL;
+	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+	if (!(flags & MAP_ANONYMOUS)) {
+		file = fget(fd);
+		if (!file)
+			goto out;
+	}
+	down_write(&current->mm->mmap_sem);
+	error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
+	up_write(&current->mm->mmap_sem);
+
+	if (file)
+		fput(file);
+out:
+	return error;
+}
+
+static void find_start_end(unsigned long flags, unsigned long *begin,
+			   unsigned long *end)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32)) { 
+		*begin = TASK_UNMAPPED_32;
+		*end = IA32_PAGE_OFFSET; 
+	} else 
+#endif
+	if (flags & MAP_32BIT) { 
+		/* This is usually used needed to map code in small
+		   model, so it needs to be in the first 31bit. Limit
+		   it to that.  This means we need to move the
+		   unmapped base down for this case. This can give
+		   conflicts with the heap, but we assume that glibc
+		   malloc knows how to fall back to mmap. Give it 1GB
+		   of playground for now. -AK */ 
+		*begin = 0x40000000; 
+		*end = 0x80000000;		
+	} else { 
+		*begin = TASK_UNMAPPED_64; 
+		*end = TASK_SIZE; 
+		}
+} 
+
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long start_addr;
+	unsigned long begin, end;
+	
+	find_start_end(flags, &begin, &end); 
+
+	if (len > end)
+		return -ENOMEM;
+
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (end - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+	addr = mm->free_area_cache;
+	if (addr < begin) 
+		addr = begin; 
+	start_addr = addr;
+
+full_search:
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (end - len < addr) {
+			/*
+			 * Start a new search - just in case we missed
+			 * some holes.
+			 */
+			if (start_addr != begin) {
+				start_addr = addr = begin;
+				goto full_search;
+			}
+			return -ENOMEM;
+		}
+		if (!vma || addr + len <= vma->vm_start) {
+			/*
+			 * Remember the place where we stopped the search:
+			 */
+			mm->free_area_cache = addr + len;
+			return addr;
+		}
+		addr = vma->vm_end;
+	}
+}
+
+asmlinkage long sys_uname(struct new_utsname __user * name)
+{
+	int err;
+	down_read(&uts_sem);
+	err = copy_to_user(name, &system_utsname, sizeof (*name));
+	up_read(&uts_sem);
+	if (personality(current->personality) == PER_LINUX32) 
+		err |= copy_to_user(&name->machine, "i686", 5); 		
+	return err ? -EFAULT : 0;
+}
+
+asmlinkage long wrap_sys_shmat(int shmid, char __user *shmaddr, int shmflg)
+{
+	unsigned long raddr;
+	return do_shmat(shmid,shmaddr,shmflg,&raddr) ?: (long)raddr;
+}
+
+asmlinkage long sys_time64(long __user * tloc)
+{
+	struct timeval now; 
+	int i; 
+
+	do_gettimeofday(&now);
+	i = now.tv_sec;
+	if (tloc) {
+		if (put_user(i,tloc))
+			i = -EFAULT;
+	}
+	return i;
+}
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
new file mode 100644
index 000000000000..e263685f864c
--- /dev/null
+++ b/arch/x86_64/kernel/syscall.c
@@ -0,0 +1,26 @@
+/* System call table for x86-64. */ 
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <linux/config.h>
+
+#define __NO_STUBS
+
+#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 
+#undef _ASM_X86_64_UNISTD_H_
+#include <asm-x86_64/unistd.h>
+
+#undef __SYSCALL
+#define __SYSCALL(nr, sym) [ nr ] = sym, 
+#undef _ASM_X86_64_UNISTD_H_
+
+typedef void (*sys_call_ptr_t)(void); 
+
+extern void sys_ni_syscall(void);
+
+sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = { 
+	/* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ 
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm-x86_64/unistd.h>
+};
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
new file mode 100644
index 000000000000..3bafe438fa75
--- /dev/null
+++ b/arch/x86_64/kernel/time.c
@@ -0,0 +1,1262 @@
+/*
+ *  linux/arch/x86-64/kernel/time.c
+ *
+ *  "High Precision Event Timer" based timekeeping.
+ *
+ *  Copyright (c) 1991,1992,1995  Linus Torvalds
+ *  Copyright (c) 1994  Alan Modra
+ *  Copyright (c) 1995  Markus Kuhn
+ *  Copyright (c) 1996  Ingo Molnar
+ *  Copyright (c) 1998  Andrea Arcangeli
+ *  Copyright (c) 2002  Vojtech Pavlik
+ *  Copyright (c) 2003  Andi Kleen
+ *  RTC support code taken from arch/i386/kernel/timers/time_hpet.c
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/mc146818rtc.h>
+#include <linux/irq.h>
+#include <linux/time.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/sysdev.h>
+#include <linux/bcd.h>
+#include <linux/kallsyms.h>
+#include <asm/8253pit.h>
+#include <asm/pgtable.h>
+#include <asm/vsyscall.h>
+#include <asm/timex.h>
+#include <asm/proto.h>
+#include <asm/hpet.h>
+#include <asm/sections.h>
+#include <linux/cpufreq.h>
+#include <linux/hpet.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/apic.h>
+#endif
+
+u64 jiffies_64 = INITIAL_JIFFIES;
+
+EXPORT_SYMBOL(jiffies_64);
+
+#ifdef CONFIG_CPU_FREQ
+static void cpufreq_delayed_get(void);
+#endif
+extern void i8254_timer_resume(void);
+extern int using_apic_timer;
+
+DEFINE_SPINLOCK(rtc_lock);
+DEFINE_SPINLOCK(i8253_lock);
+
+static int nohpet __initdata = 0;
+static int notsc __initdata = 0;
+
+#undef HPET_HACK_ENABLE_DANGEROUS
+
+unsigned int cpu_khz;					/* TSC clocks / usec, not used here */
+static unsigned long hpet_period;			/* fsecs / HPET clock */
+unsigned long hpet_tick;				/* HPET clocks / interrupt */
+unsigned long vxtime_hz = PIT_TICK_RATE;
+int report_lost_ticks;				/* command line option */
+unsigned long long monotonic_base;
+
+struct vxtime_data __vxtime __section_vxtime;	/* for vsyscalls */
+
+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
+struct timespec __xtime __section_xtime;
+struct timezone __sys_tz __section_sys_tz;
+
+static inline void rdtscll_sync(unsigned long *tsc)
+{
+#ifdef CONFIG_SMP
+	sync_core();
+#endif
+	rdtscll(*tsc);
+}
+
+/*
+ * do_gettimeoffset() returns microseconds since last timer interrupt was
+ * triggered by hardware. A memory read of HPET is slower than a register read
+ * of TSC, but much more reliable. It's also synchronized to the timer
+ * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a
+ * timer interrupt has happened already, but vxtime.trigger wasn't updated yet.
+ * This is not a problem, because jiffies hasn't updated either. They are bound
+ * together by xtime_lock.
+ */
+
+static inline unsigned int do_gettimeoffset_tsc(void)
+{
+	unsigned long t;
+	unsigned long x;
+	rdtscll_sync(&t);
+	if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */
+	x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
+	return x;
+}
+
+static inline unsigned int do_gettimeoffset_hpet(void)
+{
+	return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32;
+}
+
+unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
+
+/*
+ * This version of gettimeofday() has microsecond resolution and better than
+ * microsecond precision, as we're using at least a 10 MHz (usually 14.31818
+ * MHz) HPET timer.
+ */
+
+void do_gettimeofday(struct timeval *tv)
+{
+	unsigned long seq, t;
+ 	unsigned int sec, usec;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		sec = xtime.tv_sec;
+		usec = xtime.tv_nsec / 1000;
+
+		/* i386 does some correction here to keep the clock 
+		   monotonous even when ntpd is fixing drift.
+		   But they didn't work for me, there is a non monotonic
+		   clock anyways with ntp.
+		   I dropped all corrections now until a real solution can
+		   be found. Note when you fix it here you need to do the same
+		   in arch/x86_64/kernel/vsyscall.c and export all needed
+		   variables in vmlinux.lds. -AK */ 
+
+		t = (jiffies - wall_jiffies) * (1000000L / HZ) +
+			do_gettimeoffset();
+		usec += t;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	tv->tv_sec = sec + usec / 1000000;
+	tv->tv_usec = usec % 1000000;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+
+/*
+ * settimeofday() first undoes the correction that gettimeofday would do
+ * on the time, and then saves it. This is ugly, but has been like this for
+ * ages already.
+ */
+
+int do_settimeofday(struct timespec *tv)
+{
+	time_t wtm_sec, sec = tv->tv_sec;
+	long wtm_nsec, nsec = tv->tv_nsec;
+
+	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irq(&xtime_lock);
+
+	nsec -= do_gettimeoffset() * 1000 +
+		(jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ);
+
+	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+	set_normalized_timespec(&xtime, sec, nsec);
+	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+	time_adjust = 0;		/* stop active adjtime() */
+	time_status |= STA_UNSYNC;
+	time_maxerror = NTP_PHASE_LIMIT;
+	time_esterror = NTP_PHASE_LIMIT;
+
+	write_sequnlock_irq(&xtime_lock);
+	clock_was_set();
+	return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+unsigned long profile_pc(struct pt_regs *regs)
+{
+	unsigned long pc = instruction_pointer(regs);
+
+	/* Assume the lock function has either no stack frame or only a single word.
+	   This checks if the address on the stack looks like a kernel text address.
+	   There is a small window for false hits, but in that case the tick
+	   is just accounted to the spinlock function.
+	   Better would be to write these functions in assembler again
+	   and check exactly. */
+	if (in_lock_functions(pc)) {
+		char *v = *(char **)regs->rsp;
+		if ((v >= _stext && v <= _etext) ||
+			(v >= _sinittext && v <= _einittext) ||
+			(v >= (char *)MODULES_VADDR  && v <= (char *)MODULES_END))
+			return (unsigned long)v;
+		return ((unsigned long *)regs->rsp)[1];
+	}
+	return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+
+/*
+ * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
+ * ms after the second nowtime has started, because when nowtime is written
+ * into the registers of the CMOS clock, it will jump to the next second
+ * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
+ * sheet for details.
+ */
+
+static void set_rtc_mmss(unsigned long nowtime)
+{
+	int real_seconds, real_minutes, cmos_minutes;
+	unsigned char control, freq_select;
+
+/*
+ * IRQs are disabled when we're called from the timer interrupt,
+ * no need for spin_lock_irqsave()
+ */
+
+	spin_lock(&rtc_lock);
+
+/*
+ * Tell the clock it's being set and stop it.
+ */
+
+	control = CMOS_READ(RTC_CONTROL);
+	CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
+
+	freq_select = CMOS_READ(RTC_FREQ_SELECT);
+	CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
+
+	cmos_minutes = CMOS_READ(RTC_MINUTES);
+		BCD_TO_BIN(cmos_minutes);
+
+/*
+ * since we're only adjusting minutes and seconds, don't interfere with hour
+ * overflow. This avoids messing with unknown time zones but requires your RTC
+ * not to be off by more than 15 minutes. Since we're calling it only when
+ * our clock is externally synchronized using NTP, this shouldn't be a problem.
+ */
+
+	real_seconds = nowtime % 60;
+	real_minutes = nowtime / 60;
+	if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
+		real_minutes += 30;		/* correct for half hour time zone */
+	real_minutes %= 60;
+
+#if 0
+	/* AMD 8111 is a really bad time keeper and hits this regularly. 
+	   It probably was an attempt to avoid screwing up DST, but ignore
+	   that for now. */	   
+	if (abs(real_minutes - cmos_minutes) >= 30) {
+		printk(KERN_WARNING "time.c: can't update CMOS clock "
+		       "from %d to %d\n", cmos_minutes, real_minutes);
+	} else
+#endif
+
+	{
+			BIN_TO_BCD(real_seconds);
+			BIN_TO_BCD(real_minutes);
+		CMOS_WRITE(real_seconds, RTC_SECONDS);
+		CMOS_WRITE(real_minutes, RTC_MINUTES);
+	}
+
+/*
+ * The following flags have to be released exactly in this order, otherwise the
+ * DS12887 (popular MC146818A clone with integrated battery and quartz) will
+ * not reset the oscillator and will not update precisely 500 ms later. You
+ * won't find this mentioned in the Dallas Semiconductor data sheets, but who
+ * believes data sheets anyway ... -- Markus Kuhn
+ */
+
+	CMOS_WRITE(control, RTC_CONTROL);
+	CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
+
+	spin_unlock(&rtc_lock);
+}
+
+
+/* monotonic_clock(): returns # of nanoseconds passed since time_init()
+ *		Note: This function is required to return accurate
+ *		time even in the absence of multiple timer ticks.
+ */
+unsigned long long monotonic_clock(void)
+{
+	unsigned long seq;
+ 	u32 last_offset, this_offset, offset;
+	unsigned long long base;
+
+	if (vxtime.mode == VXTIME_HPET) {
+		do {
+			seq = read_seqbegin(&xtime_lock);
+
+			last_offset = vxtime.last;
+			base = monotonic_base;
+			this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
+
+		} while (read_seqretry(&xtime_lock, seq));
+		offset = (this_offset - last_offset);
+		offset *=(NSEC_PER_SEC/HZ)/hpet_tick;
+		return base + offset;
+	}else{
+		do {
+			seq = read_seqbegin(&xtime_lock);
+
+			last_offset = vxtime.last_tsc;
+			base = monotonic_base;
+		} while (read_seqretry(&xtime_lock, seq));
+		sync_core();
+		rdtscll(this_offset);
+		offset = (this_offset - last_offset)*1000/cpu_khz; 
+		return base + offset;
+	}
+
+
+}
+EXPORT_SYMBOL(monotonic_clock);
+
+static noinline void handle_lost_ticks(int lost, struct pt_regs *regs)
+{
+    static long lost_count;
+    static int warned;
+
+    if (report_lost_ticks) {
+	    printk(KERN_WARNING "time.c: Lost %d timer "
+		   "tick(s)! ", lost);
+	    print_symbol("rip %s)\n", regs->rip);
+    }
+
+    if (lost_count == 1000 && !warned) {
+	    printk(KERN_WARNING
+		   "warning: many lost ticks.\n"
+		   KERN_WARNING "Your time source seems to be instable or "
+		   		"some driver is hogging interupts\n");
+	    print_symbol("rip %s\n", regs->rip);
+	    if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
+		    printk(KERN_WARNING "Falling back to HPET\n");
+		    vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
+		    vxtime.mode = VXTIME_HPET;
+		    do_gettimeoffset = do_gettimeoffset_hpet;
+	    }
+	    /* else should fall back to PIT, but code missing. */
+	    warned = 1;
+    } else
+	    lost_count++;
+
+#ifdef CONFIG_CPU_FREQ
+    /* In some cases the CPU can change frequency without us noticing
+       (like going into thermal throttle)
+       Give cpufreq a change to catch up. */
+    if ((lost_count+1) % 25 == 0) {
+	    cpufreq_delayed_get();
+    }
+#endif
+}
+
+static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	static unsigned long rtc_update = 0;
+	unsigned long tsc;
+	int delay, offset = 0, lost = 0;
+
+/*
+ * Here we are in the timer irq handler. We have irqs locally disabled (so we
+ * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
+ * on the other CPU, so we need a lock. We also need to lock the vsyscall
+ * variables, because both do_timer() and us change them -arca+vojtech
+ */
+
+	write_seqlock(&xtime_lock);
+
+	if (vxtime.hpet_address) {
+		offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
+		delay = hpet_readl(HPET_COUNTER) - offset;
+	} else {
+		spin_lock(&i8253_lock);
+		outb_p(0x00, 0x43);
+		delay = inb_p(0x40);
+		delay |= inb(0x40) << 8;
+		spin_unlock(&i8253_lock);
+		delay = LATCH - 1 - delay;
+	}
+
+	rdtscll_sync(&tsc);
+
+	if (vxtime.mode == VXTIME_HPET) {
+		if (offset - vxtime.last > hpet_tick) {
+			lost = (offset - vxtime.last) / hpet_tick - 1;
+		}
+
+		monotonic_base += 
+			(offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
+
+		vxtime.last = offset;
+	} else {
+		offset = (((tsc - vxtime.last_tsc) *
+			   vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
+
+		if (offset < 0)
+			offset = 0;
+
+		if (offset > (USEC_PER_SEC / HZ)) {
+			lost = offset / (USEC_PER_SEC / HZ);
+			offset %= (USEC_PER_SEC / HZ);
+		}
+
+		monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ;
+
+		vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
+
+		if ((((tsc - vxtime.last_tsc) *
+		      vxtime.tsc_quot) >> 32) < offset)
+			vxtime.last_tsc = tsc -
+				(((long) offset << 32) / vxtime.tsc_quot) - 1;
+	}
+
+	if (lost > 0) {
+		handle_lost_ticks(lost, regs);
+		jiffies += lost;
+	}
+
+/*
+ * Do the timer stuff.
+ */
+
+	do_timer(regs);
+#ifndef CONFIG_SMP
+	update_process_times(user_mode(regs));
+#endif
+
+/*
+ * In the SMP case we use the local APIC timer interrupt to do the profiling,
+ * except when we simulate SMP mode on a uniprocessor system, in that case we
+ * have to call the local interrupt handler.
+ */
+
+#ifndef CONFIG_X86_LOCAL_APIC
+	profile_tick(CPU_PROFILING, regs);
+#else
+	if (!using_apic_timer)
+		smp_local_timer_interrupt(regs);
+#endif
+
+/*
+ * If we have an externally synchronized Linux clock, then update CMOS clock
+ * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
+ * closest to exactly 500 ms before the next second. If the update fails, we
+ * don't care, as it'll be updated on the next turn, and the problem (time way
+ * off) isn't likely to go away much sooner anyway.
+ */
+
+	if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update &&
+		abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
+		set_rtc_mmss(xtime.tv_sec);
+		rtc_update = xtime.tv_sec + 660;
+	}
+ 
+	write_sequnlock(&xtime_lock);
+
+	return IRQ_HANDLED;
+}
+
+static unsigned int cyc2ns_scale;
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
+{
+	cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+}
+
+unsigned long long sched_clock(void)
+{
+	unsigned long a = 0;
+
+#if 0
+	/* Don't do a HPET read here. Using TSC always is much faster
+	   and HPET may not be mapped yet when the scheduler first runs.
+           Disadvantage is a small drift between CPUs in some configurations,
+	   but that should be tolerable. */
+	if (__vxtime.mode == VXTIME_HPET)
+		return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32;
+#endif
+
+	/* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
+	   which means it is not completely exact and may not be monotonous between
+	   CPUs. But the errors should be too small to matter for scheduling
+	   purposes. */
+
+	rdtscll(a);
+	return cycles_2_ns(a);
+}
+
+unsigned long get_cmos_time(void)
+{
+	unsigned int timeout, year, mon, day, hour, min, sec;
+	unsigned char last, this;
+	unsigned long flags;
+
+/*
+ * The Linux interpretation of the CMOS clock register contents: When the
+ * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the
+ * second which has precisely just started. Waiting for this can take up to 1
+ * second, we timeout approximately after 2.4 seconds on a machine with
+ * standard 8.3 MHz ISA bus.
+ */
+
+	spin_lock_irqsave(&rtc_lock, flags);
+
+	timeout = 1000000;
+	last = this = 0;
+
+	while (timeout && last && !this) {
+		last = this;
+		this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
+		timeout--;
+	}
+
+/*
+ * Here we are safe to assume the registers won't change for a whole second, so
+ * we just go ahead and read them.
+	 */
+
+		sec = CMOS_READ(RTC_SECONDS);
+		min = CMOS_READ(RTC_MINUTES);
+		hour = CMOS_READ(RTC_HOURS);
+		day = CMOS_READ(RTC_DAY_OF_MONTH);
+		mon = CMOS_READ(RTC_MONTH);
+		year = CMOS_READ(RTC_YEAR);
+
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
+/*
+ * We know that x86-64 always uses BCD format, no need to check the config
+ * register.
+ */
+
+	    BCD_TO_BIN(sec);
+	    BCD_TO_BIN(min);
+	    BCD_TO_BIN(hour);
+	    BCD_TO_BIN(day);
+	    BCD_TO_BIN(mon);
+	    BCD_TO_BIN(year);
+
+/*
+ * x86-64 systems only exists since 2002.
+ * This will work up to Dec 31, 2100
+ */
+	year += 2000;
+
+	return mktime(year, mon, day, hour, min, sec);
+}
+
+#ifdef CONFIG_CPU_FREQ
+
+/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+   changes.
+   
+   RED-PEN: On SMP we assume all CPUs run with the same frequency.  It's
+   not that important because current Opteron setups do not support
+   scaling on SMP anyroads.
+
+   Should fix up last_tsc too. Currently gettimeofday in the
+   first tick after the change will be slightly wrong. */
+
+#include <linux/workqueue.h>
+
+static unsigned int cpufreq_delayed_issched = 0;
+static unsigned int cpufreq_init = 0;
+static struct work_struct cpufreq_delayed_get_work;
+
+static void handle_cpufreq_delayed_get(void *v)
+{
+	unsigned int cpu;
+	for_each_online_cpu(cpu) {
+		cpufreq_get(cpu);
+	}
+	cpufreq_delayed_issched = 0;
+}
+
+/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries
+ * to verify the CPU frequency the timing core thinks the CPU is running
+ * at is still correct.
+ */
+static void cpufreq_delayed_get(void)
+{
+	static int warned;
+	if (cpufreq_init && !cpufreq_delayed_issched) {
+		cpufreq_delayed_issched = 1;
+		if (!warned) {
+			warned = 1;
+			printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
+		}
+		schedule_work(&cpufreq_delayed_get_work);
+	}
+}
+
+static unsigned int  ref_freq = 0;
+static unsigned long loops_per_jiffy_ref = 0;
+
+static unsigned long cpu_khz_ref = 0;
+
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+				 void *data)
+{
+        struct cpufreq_freqs *freq = data;
+	unsigned long *lpj, dummy;
+
+	lpj = &dummy;
+	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+#ifdef CONFIG_SMP
+	lpj = &cpu_data[freq->cpu].loops_per_jiffy;
+#else
+	lpj = &boot_cpu_data.loops_per_jiffy;
+#endif
+
+
+
+	if (!ref_freq) {
+		ref_freq = freq->old;
+		loops_per_jiffy_ref = *lpj;
+		cpu_khz_ref = cpu_khz;
+	}
+        if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
+            (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE)) {
+                *lpj =
+		cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+
+		cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
+		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+			vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+	}
+	
+	set_cyc2ns_scale(cpu_khz_ref / 1000);
+
+	return 0;
+}
+ 
+static struct notifier_block time_cpufreq_notifier_block = {
+         .notifier_call  = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
+	if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
+				       CPUFREQ_TRANSITION_NOTIFIER))
+		cpufreq_init = 1;
+	return 0;
+}
+
+core_initcall(cpufreq_tsc);
+
+#endif
+
+/*
+ * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
+ * it to the HPET timer of known frequency.
+ */
+
+#define TICK_COUNT 100000000
+
+static unsigned int __init hpet_calibrate_tsc(void)
+{
+	int tsc_start, hpet_start;
+	int tsc_now, hpet_now;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	local_irq_disable();
+
+	hpet_start = hpet_readl(HPET_COUNTER);
+	rdtscl(tsc_start);
+
+	do {
+		local_irq_disable();
+		hpet_now = hpet_readl(HPET_COUNTER);
+		sync_core();
+		rdtscl(tsc_now);
+		local_irq_restore(flags);
+	} while ((tsc_now - tsc_start) < TICK_COUNT &&
+		 (hpet_now - hpet_start) < TICK_COUNT);
+
+	return (tsc_now - tsc_start) * 1000000000L
+		/ ((hpet_now - hpet_start) * hpet_period / 1000);
+}
+
+
+/*
+ * pit_calibrate_tsc() uses the speaker output (channel 2) of
+ * the PIT. This is better than using the timer interrupt output,
+ * because we can read the value of the speaker with just one inb(),
+ * where we need three i/o operations for the interrupt channel.
+ * We count how many ticks the TSC does in 50 ms.
+ */
+
+static unsigned int __init pit_calibrate_tsc(void)
+{
+	unsigned long start, end;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8253_lock, flags);
+
+	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+	outb(0xb0, 0x43);
+	outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
+	outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
+	rdtscll(start);
+	sync_core();
+	while ((inb(0x61) & 0x20) == 0);
+	sync_core();
+	rdtscll(end);
+
+	spin_unlock_irqrestore(&i8253_lock, flags);
+	
+	return (end - start) / 50;
+}
+
+#ifdef	CONFIG_HPET
+static __init int late_hpet_init(void)
+{
+	struct hpet_data	hd;
+	unsigned int 		ntimer;
+
+	if (!vxtime.hpet_address)
+          return -1;
+
+	memset(&hd, 0, sizeof (hd));
+
+	ntimer = hpet_readl(HPET_ID);
+	ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+	ntimer++;
+
+	/*
+	 * Register with driver.
+	 * Timer0 and Timer1 is used by platform.
+	 */
+	hd.hd_phys_address = vxtime.hpet_address;
+	hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE);
+	hd.hd_nirqs = ntimer;
+	hd.hd_flags = HPET_DATA_PLATFORM;
+	hpet_reserve_timer(&hd, 0);
+#ifdef	CONFIG_HPET_EMULATE_RTC
+	hpet_reserve_timer(&hd, 1);
+#endif
+	hd.hd_irq[0] = HPET_LEGACY_8254;
+	hd.hd_irq[1] = HPET_LEGACY_RTC;
+	if (ntimer > 2) {
+		struct hpet		*hpet;
+		struct hpet_timer	*timer;
+		int			i;
+
+		hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
+
+		for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
+		     timer++, i++)
+			hd.hd_irq[i] = (timer->hpet_config &
+					Tn_INT_ROUTE_CNF_MASK) >>
+				Tn_INT_ROUTE_CNF_SHIFT;
+
+	}
+
+	hpet_alloc(&hd);
+	return 0;
+}
+fs_initcall(late_hpet_init);
+#endif
+
+static int hpet_timer_stop_set_go(unsigned long tick)
+{
+	unsigned int cfg;
+
+/*
+ * Stop the timers and reset the main counter.
+ */
+
+	cfg = hpet_readl(HPET_CFG);
+	cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+	hpet_writel(cfg, HPET_CFG);
+	hpet_writel(0, HPET_COUNTER);
+	hpet_writel(0, HPET_COUNTER + 4);
+
+/*
+ * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
+ * and period also hpet_tick.
+ */
+
+	hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+		    HPET_TN_32BIT, HPET_T0_CFG);
+	hpet_writel(hpet_tick, HPET_T0_CMP);
+	hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
+
+/*
+ * Go!
+ */
+
+	cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
+	hpet_writel(cfg, HPET_CFG);
+
+	return 0;
+}
+
+static int hpet_init(void)
+{
+	unsigned int id;
+
+	if (!vxtime.hpet_address)
+		return -1;
+	set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address);
+	__set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+
+/*
+ * Read the period, compute tick and quotient.
+ */
+
+	id = hpet_readl(HPET_ID);
+
+	if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) ||
+	    !(id & HPET_ID_LEGSUP))
+		return -1;
+
+	hpet_period = hpet_readl(HPET_PERIOD);
+	if (hpet_period < 100000 || hpet_period > 100000000)
+		return -1;
+
+	hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) /
+		hpet_period;
+
+	return hpet_timer_stop_set_go(hpet_tick);
+}
+
+static int hpet_reenable(void)
+{
+	return hpet_timer_stop_set_go(hpet_tick);
+}
+
+void __init pit_init(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8253_lock, flags);
+	outb_p(0x34, 0x43);		/* binary, mode 2, LSB/MSB, ch 0 */
+	outb_p(LATCH & 0xff, 0x40);	/* LSB */
+	outb_p(LATCH >> 8, 0x40);	/* MSB */
+	spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+int __init time_setup(char *str)
+{
+	report_lost_ticks = 1;
+	return 1;
+}
+
+static struct irqaction irq0 = {
+	timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
+};
+
+extern void __init config_acpi_tables(void);
+
+void __init time_init(void)
+{
+	char *timename;
+
+#ifdef HPET_HACK_ENABLE_DANGEROUS
+        if (!vxtime.hpet_address) {
+		printk(KERN_WARNING "time.c: WARNING: Enabling HPET base "
+		       "manually!\n");
+                outl(0x800038a0, 0xcf8);
+                outl(0xff000001, 0xcfc);
+                outl(0x800038a0, 0xcf8);
+                vxtime.hpet_address = inl(0xcfc) & 0xfffffffe;
+		printk(KERN_WARNING "time.c: WARNING: Enabled HPET "
+		       "at %#lx.\n", vxtime.hpet_address);
+        }
+#endif
+	if (nohpet)
+		vxtime.hpet_address = 0;
+
+	xtime.tv_sec = get_cmos_time();
+	xtime.tv_nsec = 0;
+
+	set_normalized_timespec(&wall_to_monotonic,
+	                        -xtime.tv_sec, -xtime.tv_nsec);
+
+	if (!hpet_init()) {
+                vxtime_hz = (1000000000000000L + hpet_period / 2) /
+			hpet_period;
+		cpu_khz = hpet_calibrate_tsc();
+		timename = "HPET";
+	} else {
+		pit_init();
+		cpu_khz = pit_calibrate_tsc();
+		timename = "PIT";
+	}
+
+	printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n",
+	       vxtime_hz / 1000000, vxtime_hz % 1000000, timename);
+	printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
+		cpu_khz / 1000, cpu_khz % 1000);
+	vxtime.mode = VXTIME_TSC;
+	vxtime.quot = (1000000L << 32) / vxtime_hz;
+	vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+	vxtime.hz = vxtime_hz;
+	rdtscll_sync(&vxtime.last_tsc);
+	setup_irq(0, &irq0);
+
+	set_cyc2ns_scale(cpu_khz / 1000);
+}
+
+void __init time_init_smp(void)
+{
+	char *timetype;
+
+	/*
+	 * AMD systems with more than one CPU don't have fully synchronized
+	 * TSCs. Always use HPET gettimeofday for these, although it is slower.
+	 * Intel SMP systems usually have synchronized TSCs, so use always
+	 * the TSC.
+	 *
+	 * Exceptions:
+	 * IBM Summit2 checked by oem_force_hpet_timer().
+ 	 * AMD dual core may also not need HPET. Check me.
+	 *
+	 * Can be turned off with "notsc".
+	 */
+	if (num_online_cpus() > 1 &&
+	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		notsc = 1;
+	/* Some systems will want to disable TSC and use HPET. */
+	if (oem_force_hpet_timer())
+		notsc = 1;
+	if (vxtime.hpet_address && notsc) {
+		timetype = "HPET";
+		vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
+		vxtime.mode = VXTIME_HPET;
+		do_gettimeoffset = do_gettimeoffset_hpet;
+	} else {
+		timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
+		vxtime.mode = VXTIME_TSC;
+	}
+
+	printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype);
+}
+
+__setup("report_lost_ticks", time_setup);
+
+static long clock_cmos_diff;
+static unsigned long sleep_start;
+
+static int timer_suspend(struct sys_device *dev, u32 state)
+{
+	/*
+	 * Estimate time zone so that set_time can update the clock
+	 */
+	long cmos_time =  get_cmos_time();
+
+	clock_cmos_diff = -cmos_time;
+	clock_cmos_diff += get_seconds();
+	sleep_start = cmos_time;
+	return 0;
+}
+
+static int timer_resume(struct sys_device *dev)
+{
+	unsigned long flags;
+	unsigned long sec;
+	unsigned long ctime = get_cmos_time();
+	unsigned long sleep_length = (ctime - sleep_start) * HZ;
+
+	if (vxtime.hpet_address)
+		hpet_reenable();
+	else
+		i8254_timer_resume();
+
+	sec = ctime + clock_cmos_diff;
+	write_seqlock_irqsave(&xtime_lock,flags);
+	xtime.tv_sec = sec;
+	xtime.tv_nsec = 0;
+	write_sequnlock_irqrestore(&xtime_lock,flags);
+	jiffies += sleep_length;
+	wall_jiffies += sleep_length;
+	return 0;
+}
+
+static struct sysdev_class timer_sysclass = {
+	.resume = timer_resume,
+	.suspend = timer_suspend,
+	set_kset_name("timer"),
+};
+
+
+/* XXX this driverfs stuff should probably go elsewhere later -john */
+static struct sys_device device_timer = {
+	.id	= 0,
+	.cls	= &timer_sysclass,
+};
+
+static int time_init_device(void)
+{
+	int error = sysdev_class_register(&timer_sysclass);
+	if (!error)
+		error = sysdev_register(&device_timer);
+	return error;
+}
+
+device_initcall(time_init_device);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
+ * is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
+ *    is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
+ * overhead. (DEFAULT_RTC_INT_FREQ)
+ * For (3), we use interrupts at 64Hz or user specified periodic
+ * frequency, whichever is higher.
+ */
+#include <linux/rtc.h>
+
+extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
+
+#define DEFAULT_RTC_INT_FREQ 	64
+#define RTC_NUM_INTS 		1
+
+static unsigned long UIE_on;
+static unsigned long prev_update_sec;
+
+static unsigned long AIE_on;
+static struct rtc_time alarm_time;
+
+static unsigned long PIE_on;
+static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
+static unsigned long PIE_count;
+
+static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
+
+int is_hpet_enabled(void)
+{
+	return vxtime.hpet_address != 0;
+}
+
+/*
+ * Timer 1 for RTC, we do not use periodic interrupt feature,
+ * even if HPET supports periodic interrupts on Timer 1.
+ * The reason being, to set up a periodic interrupt in HPET, we need to
+ * stop the main counter. And if we do that everytime someone diables/enables
+ * RTC, we will have adverse effect on main kernel timer running on Timer 0.
+ * So, for the time being, simulate the periodic interrupt in software.
+ *
+ * hpet_rtc_timer_init() is called for the first time and during subsequent
+ * interuppts reinit happens through hpet_rtc_timer_reinit().
+ */
+int hpet_rtc_timer_init(void)
+{
+	unsigned int cfg, cnt;
+	unsigned long flags;
+
+	if (!is_hpet_enabled())
+		return 0;
+	/*
+	 * Set the counter 1 and enable the interrupts.
+	 */
+	if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
+		hpet_rtc_int_freq = PIE_freq;
+	else
+		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
+
+	local_irq_save(flags);
+	cnt = hpet_readl(HPET_COUNTER);
+	cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
+	hpet_writel(cnt, HPET_T1_CMP);
+	local_irq_restore(flags);
+
+	cfg = hpet_readl(HPET_T1_CFG);
+	cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_T1_CFG);
+
+	return 1;
+}
+
+static void hpet_rtc_timer_reinit(void)
+{
+	unsigned int cfg, cnt;
+
+	if (!(PIE_on | AIE_on | UIE_on))
+		return;
+
+	if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
+		hpet_rtc_int_freq = PIE_freq;
+	else
+		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
+
+	/* It is more accurate to use the comparator value than current count.*/
+	cnt = hpet_readl(HPET_T1_CMP);
+	cnt += hpet_tick*HZ/hpet_rtc_int_freq;
+	hpet_writel(cnt, HPET_T1_CMP);
+
+	cfg = hpet_readl(HPET_T1_CFG);
+	cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_T1_CFG);
+
+	return;
+}
+
+/*
+ * The functions below are called from rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	if (bit_mask & RTC_UIE)
+		UIE_on = 0;
+	if (bit_mask & RTC_PIE)
+		PIE_on = 0;
+	if (bit_mask & RTC_AIE)
+		AIE_on = 0;
+
+	return 1;
+}
+
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+	int timer_init_reqd = 0;
+
+	if (!is_hpet_enabled())
+		return 0;
+
+	if (!(PIE_on | AIE_on | UIE_on))
+		timer_init_reqd = 1;
+
+	if (bit_mask & RTC_UIE) {
+		UIE_on = 1;
+	}
+	if (bit_mask & RTC_PIE) {
+		PIE_on = 1;
+		PIE_count = 0;
+	}
+	if (bit_mask & RTC_AIE) {
+		AIE_on = 1;
+	}
+
+	if (timer_init_reqd)
+		hpet_rtc_timer_init();
+
+	return 1;
+}
+
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	alarm_time.tm_hour = hrs;
+	alarm_time.tm_min = min;
+	alarm_time.tm_sec = sec;
+
+	return 1;
+}
+
+int hpet_set_periodic_freq(unsigned long freq)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	PIE_freq = freq;
+	PIE_count = 0;
+
+	return 1;
+}
+
+int hpet_rtc_dropped_irq(void)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	return 1;
+}
+
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct rtc_time curr_time;
+	unsigned long rtc_int_flag = 0;
+	int call_rtc_interrupt = 0;
+
+	hpet_rtc_timer_reinit();
+
+	if (UIE_on | AIE_on) {
+		rtc_get_rtc_time(&curr_time);
+	}
+	if (UIE_on) {
+		if (curr_time.tm_sec != prev_update_sec) {
+			/* Set update int info, call real rtc int routine */
+			call_rtc_interrupt = 1;
+			rtc_int_flag = RTC_UF;
+			prev_update_sec = curr_time.tm_sec;
+		}
+	}
+	if (PIE_on) {
+		PIE_count++;
+		if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
+			/* Set periodic int info, call real rtc int routine */
+			call_rtc_interrupt = 1;
+			rtc_int_flag |= RTC_PF;
+			PIE_count = 0;
+		}
+	}
+	if (AIE_on) {
+		if ((curr_time.tm_sec == alarm_time.tm_sec) &&
+		    (curr_time.tm_min == alarm_time.tm_min) &&
+		    (curr_time.tm_hour == alarm_time.tm_hour)) {
+			/* Set alarm int info, call real rtc int routine */
+			call_rtc_interrupt = 1;
+			rtc_int_flag |= RTC_AF;
+		}
+	}
+	if (call_rtc_interrupt) {
+		rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+		rtc_interrupt(rtc_int_flag, dev_id, regs);
+	}
+	return IRQ_HANDLED;
+}
+#endif
+
+
+
+static int __init nohpet_setup(char *s) 
+{ 
+	nohpet = 1;
+	return 0;
+} 
+
+__setup("nohpet", nohpet_setup);
+
+
+static int __init notsc_setup(char *s)
+{
+	notsc = 1;
+	return 0;
+}
+
+__setup("notsc", notsc_setup);
+
+
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
new file mode 100644
index 000000000000..6d9c9a8e7d0e
--- /dev/null
+++ b/arch/x86_64/kernel/trampoline.S
@@ -0,0 +1,64 @@
+/*
+ *
+ *	Trampoline.S	Derived from Setup.S by Linus Torvalds
+ *
+ *	4 Jan 1997 Michael Chastain: changed to gnu as.
+ *
+ *	Entry: CS:IP point to the start of our code, we are 
+ *	in real mode with no stack, but the rest of the 
+ *	trampoline page to make our stack and everything else
+ *	is a mystery.
+ *
+ *	In fact we don't actually need a stack so we don't
+ *	set one up.
+ *
+ *	On entry to trampoline_data, the processor is in real mode
+ *	with 16-bit addressing and 16-bit data.  CS has some value
+ *	and IP is zero.  Thus, data addresses need to be absolute
+ *	(no relocation) and are taken with regard to r_base.
+ *
+ *	If you work on this file, check the object module with objdump
+ *	--full-contents --reloc to make sure there are no relocation
+ *	entries. For the GDT entry we do hand relocation in smpboot.c
+ *	because of 64bit linker limitations.
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+
+.data
+
+.code16
+
+ENTRY(trampoline_data)
+r_base = .
+	wbinvd	
+	mov	%cs, %ax	# Code and data in the same place
+	mov	%ax, %ds
+
+	cli			# We should be safe anyway
+
+	movl	$0xA5A5A5A5, trampoline_data - r_base
+				# write marker for master knows we're running
+
+	lidt	idt_48 - r_base	# load idt with 0, 0
+	lgdt	gdt_48 - r_base	# load gdt with whatever is appropriate
+
+	xor	%ax, %ax
+	inc	%ax		# protected mode (PE) bit
+	lmsw	%ax		# into protected mode
+	# flaush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S
+	ljmpl	$__KERNEL32_CS, $(startup_32-__START_KERNEL_map)
+
+	# Careful these need to be in the same 64K segment as the above;
+idt_48:
+	.word	0			# idt limit = 0
+	.word	0, 0			# idt base = 0L
+
+gdt_48:
+	.short	__KERNEL32_CS + 7	# gdt limit
+	.long	cpu_gdt_table-__START_KERNEL_map
+
+.globl trampoline_end
+trampoline_end:	
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
new file mode 100644
index 000000000000..d87ac6496cc2
--- /dev/null
+++ b/arch/x86_64/kernel/traps.c
@@ -0,0 +1,948 @@
+/*
+ *  linux/arch/x86-64/traps.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ *
+ *  Pentium III FXSR, SSE support
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ *  $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $
+ */
+
+/*
+ * 'Traps.c' handles hardware traps and faults after we have saved some
+ * state in 'entry.S'.
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/kdebug.h>
+#include <asm/processor.h>
+
+#include <asm/smp.h>
+#include <asm/pgalloc.h>
+#include <asm/pda.h>
+#include <asm/proto.h>
+#include <asm/nmi.h>
+
+#include <linux/irq.h>
+
+
+extern struct gate_struct idt_table[256]; 
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+asmlinkage void double_fault(void);
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void simd_coprocessor_error(void);
+asmlinkage void reserved(void);
+asmlinkage void alignment_check(void);
+asmlinkage void machine_check(void);
+asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void call_debug(void);
+
+struct notifier_block *die_chain;
+static DEFINE_SPINLOCK(die_notifier_lock);
+
+int register_die_notifier(struct notifier_block *nb)
+{
+	int err = 0;
+	unsigned long flags;
+	spin_lock_irqsave(&die_notifier_lock, flags);
+	err = notifier_chain_register(&die_chain, nb);
+	spin_unlock_irqrestore(&die_notifier_lock, flags);
+	return err;
+}
+
+static inline void conditional_sti(struct pt_regs *regs)
+{
+	if (regs->eflags & X86_EFLAGS_IF)
+		local_irq_enable();
+}
+
+static int kstack_depth_to_print = 10;
+
+#ifdef CONFIG_KALLSYMS
+#include <linux/kallsyms.h> 
+int printk_address(unsigned long address)
+{ 
+	unsigned long offset = 0, symsize;
+	const char *symname;
+	char *modname;
+	char *delim = ":"; 
+	char namebuf[128];
+
+	symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); 
+	if (!symname) 
+		return printk("[<%016lx>]", address);
+	if (!modname) 
+		modname = delim = ""; 		
+        return printk("<%016lx>{%s%s%s%s%+ld}",
+		      address,delim,modname,delim,symname,offset); 
+} 
+#else
+int printk_address(unsigned long address)
+{ 
+	return printk("[<%016lx>]", address);
+} 
+#endif
+
+unsigned long *in_exception_stack(int cpu, unsigned long stack) 
+{ 
+	int k;
+	for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+		struct tss_struct *tss = &per_cpu(init_tss, cpu);
+		unsigned long start = tss->ist[k] - EXCEPTION_STKSZ;
+
+		if (stack >= start && stack < tss->ist[k])
+			return (unsigned long *)tss->ist[k];
+	}
+	return NULL;
+} 
+
+/*
+ * x86-64 can have upto three kernel stacks: 
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault) hardware stack
+ * Check and process them in order.
+ */
+
+void show_trace(unsigned long *stack)
+{
+	unsigned long addr;
+	unsigned long *irqstack, *irqstack_end, *estack_end;
+	const int cpu = safe_smp_processor_id();
+	int i;
+
+	printk("\nCall Trace:");
+	i = 0; 
+	
+	estack_end = in_exception_stack(cpu, (unsigned long)stack); 
+	if (estack_end) { 
+		while (stack < estack_end) { 
+			addr = *stack++; 
+			if (__kernel_text_address(addr)) {
+				i += printk_address(addr);
+				i += printk(" "); 
+				if (i > 50) {
+					printk("\n"); 
+					i = 0;
+				}
+			}
+		}
+		i += printk(" <EOE> "); 
+		i += 7;
+		stack = (unsigned long *) estack_end[-2]; 
+	}  
+
+	irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
+	irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE + 64);
+
+	if (stack >= irqstack && stack < irqstack_end) {
+		printk("<IRQ> ");  
+		while (stack < irqstack_end) {
+			addr = *stack++;
+			/*
+			 * If the address is either in the text segment of the
+			 * kernel, or in the region which contains vmalloc'ed
+			 * memory, it *may* be the address of a calling
+			 * routine; if so, print it so that someone tracing
+			 * down the cause of the crash will be able to figure
+			 * out the call path that was taken.
+			 */
+			 if (__kernel_text_address(addr)) {
+				 i += printk_address(addr);
+				 i += printk(" "); 
+				 if (i > 50) { 
+					printk("\n       ");
+					 i = 0;
+				 } 
+			}
+		} 
+		stack = (unsigned long *) (irqstack_end[-1]);
+		printk(" <EOI> ");
+		i += 7;
+	} 
+
+	while (((long) stack & (THREAD_SIZE-1)) != 0) {
+		addr = *stack++;
+		if (__kernel_text_address(addr)) {
+			i += printk_address(addr);
+			i += printk(" "); 
+			if (i > 50) { 
+				printk("\n       ");
+					 i = 0;
+			} 
+		}
+	}
+	printk("\n");
+}
+
+void show_stack(struct task_struct *tsk, unsigned long * rsp)
+{
+	unsigned long *stack;
+	int i;
+	const int cpu = safe_smp_processor_id();
+	unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
+	unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE);    
+
+	// debugging aid: "show_stack(NULL, NULL);" prints the
+	// back trace for this cpu.
+
+	if (rsp == NULL) {
+		if (tsk)
+			rsp = (unsigned long *)tsk->thread.rsp;
+		else
+			rsp = (unsigned long *)&rsp;
+	}
+
+	stack = rsp;
+	for(i=0; i < kstack_depth_to_print; i++) {
+		if (stack >= irqstack && stack <= irqstack_end) {
+			if (stack == irqstack_end) {
+				stack = (unsigned long *) (irqstack_end[-1]);
+				printk(" <EOI> ");
+			}
+		} else {
+		if (((long) stack & (THREAD_SIZE-1)) == 0)
+			break;
+		}
+		if (i && ((i % 4) == 0))
+			printk("\n       ");
+		printk("%016lx ", *stack++);
+	}
+	show_trace((unsigned long *)rsp);
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+	unsigned long dummy;
+	show_trace(&dummy);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+	int i;
+	int in_kernel = (regs->cs & 3) == 0;
+	unsigned long rsp;
+	const int cpu = safe_smp_processor_id(); 
+	struct task_struct *cur = cpu_pda[cpu].pcurrent; 
+
+		rsp = regs->rsp;
+
+	printk("CPU %d ", cpu);
+	__show_regs(regs);
+	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
+		cur->comm, cur->pid, cur->thread_info, cur);
+
+	/*
+	 * When in-kernel, we also print out the stack and code at the
+	 * time of the fault..
+	 */
+	if (in_kernel) {
+
+		printk("Stack: ");
+		show_stack(NULL, (unsigned long*)rsp);
+
+		printk("\nCode: ");
+		if(regs->rip < PAGE_OFFSET)
+			goto bad;
+
+		for(i=0;i<20;i++)
+		{
+			unsigned char c;
+			if(__get_user(c, &((unsigned char*)regs->rip)[i])) {
+bad:
+				printk(" Bad RIP value.");
+				break;
+			}
+			printk("%02x ", c);
+		}
+	}
+	printk("\n");
+}	
+
+void handle_BUG(struct pt_regs *regs)
+{ 
+	struct bug_frame f;
+	char tmp;
+
+	if (regs->cs & 3)
+		return; 
+	if (__copy_from_user(&f, (struct bug_frame *) regs->rip, 
+			     sizeof(struct bug_frame)))
+		return; 
+	if ((unsigned long)f.filename < __PAGE_OFFSET || 
+	    f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) 
+		return;
+	if (__get_user(tmp, f.filename))
+		f.filename = "unmapped filename"; 
+	printk("----------- [cut here ] --------- [please bite here ] ---------\n");
+	printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line);
+} 
+
+void out_of_line_bug(void)
+{ 
+	BUG(); 
+} 
+
+static DEFINE_SPINLOCK(die_lock);
+static int die_owner = -1;
+
+void oops_begin(void)
+{
+	int cpu = safe_smp_processor_id(); 
+	/* racy, but better than risking deadlock. */ 
+	local_irq_disable();
+	if (!spin_trylock(&die_lock)) { 
+		if (cpu == die_owner) 
+			/* nested oops. should stop eventually */;
+		else
+			spin_lock(&die_lock); 
+	}
+	die_owner = cpu; 
+	console_verbose();
+	bust_spinlocks(1); 
+}
+
+void oops_end(void)
+{ 
+	die_owner = -1;
+	bust_spinlocks(0); 
+	spin_unlock(&die_lock); 
+	if (panic_on_oops)
+		panic("Oops"); 
+} 
+
+void __die(const char * str, struct pt_regs * regs, long err)
+{
+	static int die_counter;
+	printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
+#ifdef CONFIG_PREEMPT
+	printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+	printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk("DEBUG_PAGEALLOC");
+#endif
+	printk("\n");
+	notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
+	show_registers(regs);
+	/* Executive summary in case the oops scrolled away */
+	printk(KERN_ALERT "RIP ");
+	printk_address(regs->rip); 
+	printk(" RSP <%016lx>\n", regs->rsp); 
+}
+
+void die(const char * str, struct pt_regs * regs, long err)
+{
+	oops_begin();
+	handle_BUG(regs);
+	__die(str, regs, err);
+	oops_end();
+	do_exit(SIGSEGV); 
+}
+static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
+{
+	if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS))
+		die(str, regs, err);
+}
+
+void die_nmi(char *str, struct pt_regs *regs)
+{
+	oops_begin();
+	/*
+	 * We are in trouble anyway, lets at least try
+	 * to get a message out.
+	 */
+	printk(str, safe_smp_processor_id());
+	show_registers(regs);
+	if (panic_on_timeout || panic_on_oops)
+		panic("nmi watchdog");
+	printk("console shuts up ...\n");
+	oops_end();
+	do_exit(SIGSEGV);
+}
+
+static void do_trap(int trapnr, int signr, char *str, 
+			   struct pt_regs * regs, long error_code, siginfo_t *info)
+{
+	conditional_sti(regs);
+
+#ifdef CONFIG_CHECKING
+       { 
+               unsigned long gs; 
+               struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); 
+               rdmsrl(MSR_GS_BASE, gs); 
+               if (gs != (unsigned long)pda) { 
+                       wrmsrl(MSR_GS_BASE, pda); 
+                       printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda,
+			      regs->rip);
+               }
+       }
+#endif
+
+	if ((regs->cs & 3)  != 0) { 
+		struct task_struct *tsk = current;
+
+		if (exception_trace && unhandled_signal(tsk, signr))
+			printk(KERN_INFO
+			       "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
+			       tsk->comm, tsk->pid, str,
+			       regs->rip,regs->rsp,error_code); 
+
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = trapnr;
+		if (info)
+			force_sig_info(signr, info, tsk);
+		else
+			force_sig(signr, tsk);
+		return;
+	}
+
+
+	/* kernel trap */ 
+	{	     
+		const struct exception_table_entry *fixup;
+		fixup = search_exception_tables(regs->rip);
+		if (fixup) {
+			regs->rip = fixup->fixup;
+		} else	
+			die(str, regs, error_code);
+		return;
+	}
+}
+
+#define DO_ERROR(trapnr, signr, str, name) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+							== NOTIFY_STOP) \
+		return; \
+	do_trap(trapnr, signr, str, regs, error_code, NULL); \
+}
+
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+	siginfo_t info; \
+	info.si_signo = signr; \
+	info.si_errno = 0; \
+	info.si_code = sicode; \
+	info.si_addr = (void __user *)siaddr; \
+	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+							== NOTIFY_STOP) \
+		return; \
+	do_trap(trapnr, signr, str, regs, error_code, &info); \
+}
+
+DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
+DO_ERROR( 4, SIGSEGV, "overflow", overflow)
+DO_ERROR( 5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO( 6, SIGILL,  "invalid operand", invalid_op, ILL_ILLOPN, regs->rip)
+DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
+DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR(18, SIGSEGV, "reserved", reserved)
+
+#define DO_ERROR_STACK(trapnr, signr, str, name) \
+asmlinkage void *do_##name(struct pt_regs * regs, long error_code) \
+{ \
+	struct pt_regs *pr = ((struct pt_regs *)(current->thread.rsp0))-1; \
+	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+							== NOTIFY_STOP) \
+		return regs; \
+	if (regs->cs & 3) { \
+		memcpy(pr, regs, sizeof(struct pt_regs)); \
+		regs = pr; \
+	} \
+	do_trap(trapnr, signr, str, regs, error_code, NULL); \
+	return regs;		\
+}
+
+DO_ERROR_STACK(12, SIGBUS,  "stack segment", stack_segment)
+DO_ERROR_STACK( 8, SIGSEGV, "double fault", double_fault)
+
+asmlinkage void do_general_protection(struct pt_regs * regs, long error_code)
+{
+	conditional_sti(regs);
+
+#ifdef CONFIG_CHECKING
+       { 
+               unsigned long gs; 
+               struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); 
+               rdmsrl(MSR_GS_BASE, gs); 
+               if (gs != (unsigned long)pda) { 
+                       wrmsrl(MSR_GS_BASE, pda); 
+		       oops_in_progress++;
+                       printk("general protection handler: wrong gs %lx expected %p\n", gs, pda);
+		       oops_in_progress--;
+               }
+       }
+#endif
+
+	if ((regs->cs & 3)!=0) { 
+		struct task_struct *tsk = current;
+
+		if (exception_trace && unhandled_signal(tsk, SIGSEGV))
+			printk(KERN_INFO
+		       "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
+			       tsk->comm, tsk->pid,
+			       regs->rip,regs->rsp,error_code); 
+
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = 13;
+		force_sig(SIGSEGV, tsk);
+		return;
+	} 
+
+	/* kernel gp */
+	{
+		const struct exception_table_entry *fixup;
+		fixup = search_exception_tables(regs->rip);
+		if (fixup) {
+			regs->rip = fixup->fixup;
+			return;
+		}
+		if (notify_die(DIE_GPF, "general protection fault", regs,
+					error_code, 13, SIGSEGV) == NOTIFY_STOP)
+			return;
+		die("general protection fault", regs, error_code);
+	}
+}
+
+static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
+{
+	printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
+	printk("You probably have a hardware problem with your RAM chips\n");
+
+	/* Clear and disable the memory parity error line. */
+	reason = (reason & 0xf) | 4;
+	outb(reason, 0x61);
+}
+
+static void io_check_error(unsigned char reason, struct pt_regs * regs)
+{
+	printk("NMI: IOCK error (debug interrupt?)\n");
+	show_registers(regs);
+
+	/* Re-enable the IOCK line, wait for a few seconds */
+	reason = (reason & 0xf) | 8;
+	outb(reason, 0x61);
+	mdelay(2000);
+	reason &= ~8;
+	outb(reason, 0x61);
+}
+
+static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+{	printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
+	printk("Dazed and confused, but trying to continue\n");
+	printk("Do you have a strange power saving mode enabled?\n");
+}
+
+asmlinkage void default_do_nmi(struct pt_regs *regs)
+{
+	unsigned char reason = 0;
+
+	/* Only the BSP gets external NMIs from the system.  */
+	if (!smp_processor_id())
+		reason = get_nmi_reason();
+
+	if (!(reason & 0xc0)) {
+		if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
+								== NOTIFY_STOP)
+			return;
+#ifdef CONFIG_X86_LOCAL_APIC
+		/*
+		 * Ok, so this is none of the documented NMI sources,
+		 * so it must be the NMI watchdog.
+		 */
+		if (nmi_watchdog > 0) {
+			nmi_watchdog_tick(regs,reason);
+			return;
+		}
+#endif
+		unknown_nmi_error(reason, regs);
+		return;
+	}
+	if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
+		return; 
+
+	/* AK: following checks seem to be broken on modern chipsets. FIXME */
+
+	if (reason & 0x80)
+		mem_parity_error(reason, regs);
+	if (reason & 0x40)
+		io_check_error(reason, regs);
+}
+
+asmlinkage void do_int3(struct pt_regs * regs, long error_code)
+{
+	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
+		return;
+	}
+	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
+	return;
+}
+
+/* runs on IST stack. */
+asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code)
+{
+	struct pt_regs *pr;
+	unsigned long condition;
+	struct task_struct *tsk = current;
+	siginfo_t info;
+
+	pr = (struct pt_regs *)(current->thread.rsp0)-1;
+	if (regs->cs & 3) {
+		memcpy(pr, regs, sizeof(struct pt_regs));
+		regs = pr;
+	}	
+
+#ifdef CONFIG_CHECKING
+       { 
+	       /* RED-PEN interaction with debugger - could destroy gs */
+               unsigned long gs; 
+               struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); 
+               rdmsrl(MSR_GS_BASE, gs); 
+               if (gs != (unsigned long)pda) { 
+                       wrmsrl(MSR_GS_BASE, pda); 
+                       printk("debug handler: wrong gs %lx expected %p\n", gs, pda);
+               }
+       }
+#endif
+
+	asm("movq %%db6,%0" : "=r" (condition));
+
+	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+						SIGTRAP) == NOTIFY_STOP) {
+		return regs;
+	}
+	conditional_sti(regs);
+
+	/* Mask out spurious debug traps due to lazy DR7 setting */
+	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
+		if (!tsk->thread.debugreg7) { 
+			goto clear_dr7;
+		}
+	}
+
+	tsk->thread.debugreg6 = condition;
+
+	/* Mask out spurious TF errors due to lazy TF clearing */
+	if ((condition & DR_STEP) &&
+	    (notify_die(DIE_DEBUGSTEP, "debugstep", regs, condition,
+			1, SIGTRAP) != NOTIFY_STOP)) {
+		/*
+		 * The TF error should be masked out only if the current
+		 * process is not traced and if the TRAP flag has been set
+		 * previously by a tracing process (condition detected by
+		 * the PT_DTRACE flag); remember that the i386 TRAP flag
+		 * can be modified by the process itself in user mode,
+		 * allowing programs to debug themselves without the ptrace()
+		 * interface.
+		 */
+                if ((regs->cs & 3) == 0)
+                       goto clear_TF_reenable;
+		if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE)
+			goto clear_TF;
+	}
+
+	/* Ok, finally something we can handle */
+	tsk->thread.trap_no = 1;
+	tsk->thread.error_code = error_code;
+	info.si_signo = SIGTRAP;
+	info.si_errno = 0;
+	info.si_code = TRAP_BRKPT;
+	if ((regs->cs & 3) == 0) 
+		goto clear_dr7; 
+
+	info.si_addr = (void __user *)regs->rip;
+	force_sig_info(SIGTRAP, &info, tsk);	
+clear_dr7:
+	asm volatile("movq %0,%%db7"::"r"(0UL));
+	notify_die(DIE_DEBUG, "debug", regs, condition, 1, SIGTRAP);
+	return regs;
+
+clear_TF_reenable:
+	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+
+clear_TF:
+	/* RED-PEN could cause spurious errors */
+	if (notify_die(DIE_DEBUG, "debug2", regs, condition, 1, SIGTRAP) 
+								!= NOTIFY_STOP)
+	regs->eflags &= ~TF_MASK;
+	return regs;	
+}
+
+static int kernel_math_error(struct pt_regs *regs, char *str)
+{
+	const struct exception_table_entry *fixup;
+	fixup = search_exception_tables(regs->rip);
+	if (fixup) {
+		regs->rip = fixup->fixup;
+		return 1;
+	}
+	notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE);
+#if 0
+	/* This should be a die, but warn only for now */
+	die(str, regs, 0);
+#else
+	printk(KERN_DEBUG "%s: %s at ", current->comm, str);
+	printk_address(regs->rip);
+	printk("\n");
+#endif
+	return 0;
+}
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+asmlinkage void do_coprocessor_error(struct pt_regs *regs)
+{
+	void __user *rip = (void __user *)(regs->rip);
+	struct task_struct * task;
+	siginfo_t info;
+	unsigned short cwd, swd;
+
+	conditional_sti(regs);
+	if ((regs->cs & 3) == 0 &&
+	    kernel_math_error(regs, "kernel x87 math error"))
+		return;
+
+	/*
+	 * Save the info for the exception handler and clear the error.
+	 */
+	task = current;
+	save_init_fpu(task);
+	task->thread.trap_no = 16;
+	task->thread.error_code = 0;
+	info.si_signo = SIGFPE;
+	info.si_errno = 0;
+	info.si_code = __SI_FAULT;
+	info.si_addr = rip;
+	/*
+	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
+	 * status.  0x3f is the exception bits in these regs, 0x200 is the
+	 * C1 reg you need in case of a stack fault, 0x040 is the stack
+	 * fault bit.  We should only be taking one exception at a time,
+	 * so if this combination doesn't produce any single exception,
+	 * then we have a bad program that isn't synchronizing its FPU usage
+	 * and it will suffer the consequences since we won't be able to
+	 * fully reproduce the context of the exception
+	 */
+	cwd = get_fpu_cwd(task);
+	swd = get_fpu_swd(task);
+	switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
+		case 0x000:
+		default:
+			break;
+		case 0x001: /* Invalid Op */
+		case 0x041: /* Stack Fault */
+		case 0x241: /* Stack Fault | Direction */
+			info.si_code = FPE_FLTINV;
+			break;
+		case 0x002: /* Denormalize */
+		case 0x010: /* Underflow */
+			info.si_code = FPE_FLTUND;
+			break;
+		case 0x004: /* Zero Divide */
+			info.si_code = FPE_FLTDIV;
+			break;
+		case 0x008: /* Overflow */
+			info.si_code = FPE_FLTOVF;
+			break;
+		case 0x020: /* Precision */
+			info.si_code = FPE_FLTRES;
+			break;
+	}
+	force_sig_info(SIGFPE, &info, task);
+}
+
+asmlinkage void bad_intr(void)
+{
+	printk("bad interrupt"); 
+}
+
+asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
+{
+	void __user *rip = (void __user *)(regs->rip);
+	struct task_struct * task;
+	siginfo_t info;
+	unsigned short mxcsr;
+
+	conditional_sti(regs);
+	if ((regs->cs & 3) == 0 &&
+        	kernel_math_error(regs, "simd math error"))
+		return;
+
+	/*
+	 * Save the info for the exception handler and clear the error.
+	 */
+	task = current;
+	save_init_fpu(task);
+	task->thread.trap_no = 19;
+	task->thread.error_code = 0;
+	info.si_signo = SIGFPE;
+	info.si_errno = 0;
+	info.si_code = __SI_FAULT;
+	info.si_addr = rip;
+	/*
+	 * The SIMD FPU exceptions are handled a little differently, as there
+	 * is only a single status/control register.  Thus, to determine which
+	 * unmasked exception was caught we must mask the exception mask bits
+	 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+	 */
+	mxcsr = get_fpu_mxcsr(task);
+	switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
+		case 0x000:
+		default:
+			break;
+		case 0x001: /* Invalid Op */
+			info.si_code = FPE_FLTINV;
+			break;
+		case 0x002: /* Denormalize */
+		case 0x010: /* Underflow */
+			info.si_code = FPE_FLTUND;
+			break;
+		case 0x004: /* Zero Divide */
+			info.si_code = FPE_FLTDIV;
+			break;
+		case 0x008: /* Overflow */
+			info.si_code = FPE_FLTOVF;
+			break;
+		case 0x020: /* Precision */
+			info.si_code = FPE_FLTRES;
+			break;
+	}
+	force_sig_info(SIGFPE, &info, task);
+}
+
+asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
+{
+}
+
+asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+{
+}
+
+/*
+ *  'math_state_restore()' saves the current math information in the
+ * old math state array, and gets the new ones from the current task
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+ * Don't touch unless you *really* know how it works.
+ */
+asmlinkage void math_state_restore(void)
+{
+	struct task_struct *me = current;
+	clts();			/* Allow maths ops (or we recurse) */
+
+	if (!used_math())
+		init_fpu(me);
+	restore_fpu_checking(&me->thread.i387.fxsave);
+	me->thread_info->status |= TS_USEDFPU;
+}
+
+void do_call_debug(struct pt_regs *regs) 
+{ 
+	notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT); 
+}
+
+void __init trap_init(void)
+{
+	set_intr_gate(0,&divide_error);
+	set_intr_gate_ist(1,&debug,DEBUG_STACK);
+	set_intr_gate_ist(2,&nmi,NMI_STACK);
+	set_system_gate(3,&int3);
+	set_system_gate(4,&overflow);	/* int4-5 can be called from all */
+	set_system_gate(5,&bounds);
+	set_intr_gate(6,&invalid_op);
+	set_intr_gate(7,&device_not_available);
+	set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
+	set_intr_gate(9,&coprocessor_segment_overrun);
+	set_intr_gate(10,&invalid_TSS);
+	set_intr_gate(11,&segment_not_present);
+	set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
+	set_intr_gate(13,&general_protection);
+	set_intr_gate(14,&page_fault);
+	set_intr_gate(15,&spurious_interrupt_bug);
+	set_intr_gate(16,&coprocessor_error);
+	set_intr_gate(17,&alignment_check);
+#ifdef CONFIG_X86_MCE
+	set_intr_gate_ist(18,&machine_check, MCE_STACK); 
+#endif
+	set_intr_gate(19,&simd_coprocessor_error);
+
+#ifdef CONFIG_IA32_EMULATION
+	set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+#endif
+       
+	set_intr_gate(KDB_VECTOR, call_debug);
+       
+	/*
+	 * Should be a barrier for any external CPU state.
+	 */
+	cpu_init();
+}
+
+
+/* Actual parsing is done early in setup.c. */
+static int __init oops_dummy(char *s)
+{ 
+	panic_on_oops = 1;
+	return -1; 
+} 
+__setup("oops=", oops_dummy); 
+
+static int __init kstack_setup(char *s)
+{
+	kstack_depth_to_print = simple_strtoul(s,NULL,0);
+	return 0;
+}
+__setup("kstack=", kstack_setup);
+
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..59ebd5beda87
--- /dev/null
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -0,0 +1,164 @@
+/* ld script to make x86-64 Linux kernel
+ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
+ */
+
+#include <asm-generic/vmlinux.lds.h>
+#include <linux/config.h>
+
+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+SECTIONS
+{
+  . = 0xffffffff80100000;
+  phys_startup_64 = startup_64 - LOAD_OFFSET;
+  _text = .;			/* Text and read-only data */
+  .text : {
+	*(.text)
+	SCHED_TEXT
+	LOCK_TEXT
+	*(.fixup)
+	*(.gnu.warning)
+	} = 0x9090
+  .text.lock : { *(.text.lock) }	/* out-of-line lock text */
+
+  _etext = .;			/* End of text section */
+
+  . = ALIGN(16);		/* Exception table */
+  __start___ex_table = .;
+  __ex_table : { *(__ex_table) }
+  __stop___ex_table = .;
+
+  RODATA
+
+  .data : {			/* Data */
+	*(.data)
+	CONSTRUCTORS
+	}
+
+  _edata = .;			/* End of data section */
+
+  __bss_start = .;		/* BSS */
+  .bss : {
+	*(.bss.page_aligned)	
+	*(.bss)
+	}
+  __bss_end = .;
+
+  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+
+#define AFTER(x)      BINALIGN(LOADADDR(x) + SIZEOF(x), 16)
+#define BINALIGN(x,y) (((x) + (y) - 1)  & ~((y) - 1))
+#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES)
+
+  .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
+  __vsyscall_0 = LOADADDR(.vsyscall_0);
+  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) }
+  xtime_lock = LOADADDR(.xtime_lock);
+  .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) }
+  vxtime = LOADADDR(.vxtime);
+  .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) }
+  wall_jiffies = LOADADDR(.wall_jiffies);
+  .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) }
+  sys_tz = LOADADDR(.sys_tz);
+  .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) }
+  sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); 
+  .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) }
+  xtime = LOADADDR(.xtime);
+  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) }
+  jiffies = LOADADDR(.jiffies);
+  .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
+  . = LOADADDR(.vsyscall_0) + 4096;
+
+  . = ALIGN(8192);		/* init_task */
+  .data.init_task : { *(.data.init_task) }
+
+  . = ALIGN(4096);
+  .data.page_aligned : { *(.data.page_aligned) }
+
+  . = ALIGN(4096);		/* Init code and data */
+  __init_begin = .;
+  .init.text : { 
+	_sinittext = .;
+	*(.init.text)
+	_einittext = .;
+  }
+  __initdata_begin = .;
+  .init.data : { *(.init.data) }
+  __initdata_end = .;
+  . = ALIGN(16);
+  __setup_start = .;
+  .init.setup : { *(.init.setup) }
+  __setup_end = .;
+  __initcall_start = .;
+  .initcall.init : {
+	*(.initcall1.init) 
+	*(.initcall2.init) 
+	*(.initcall3.init) 
+	*(.initcall4.init) 
+	*(.initcall5.init) 
+	*(.initcall6.init) 
+	*(.initcall7.init)
+  }
+  __initcall_end = .;
+  __con_initcall_start = .;
+  .con_initcall.init : { *(.con_initcall.init) }
+  __con_initcall_end = .;
+  SECURITY_INIT
+  . = ALIGN(8);
+  __alt_instructions = .;
+  .altinstructions : { *(.altinstructions) } 
+  __alt_instructions_end = .; 
+ .altinstr_replacement : { *(.altinstr_replacement) }
+  /* .exit.text is discard at runtime, not link time, to deal with references
+     from .altinstructions and .eh_frame */
+  .exit.text : { *(.exit.text) }
+  .exit.data : { *(.exit.data) }	
+  . = ALIGN(4096);
+  __initramfs_start = .;
+  .init.ramfs : { *(.init.ramfs) }
+  __initramfs_end = .;	
+  . = ALIGN(32);
+  __per_cpu_start = .;
+  .data.percpu  : { *(.data.percpu) }
+  __per_cpu_end = .;
+  . = ALIGN(4096);
+  __init_end = .;
+
+  . = ALIGN(4096);
+  __nosave_begin = .;
+  .data_nosave : { *(.data.nosave) }
+  . = ALIGN(4096);
+  __nosave_end = .;
+
+  _end = . ;
+
+  /* Sections to be discarded */
+  /DISCARD/ : {
+	*(.exitcall.exit)
+#ifndef CONFIG_DEBUG_INFO
+	*(.eh_frame)
+#endif
+	}
+
+  /* DWARF 2 */
+  .debug_info     0 : { *(.debug_info) }
+  .debug_abbrev   0 : { *(.debug_abbrev) }
+  .debug_line     0 : { *(.debug_line) }
+  .debug_frame    0 : { *(.debug_frame) }
+  .debug_str      0 : { *(.debug_str) }
+  .debug_loc      0 : { *(.debug_loc) }
+  .debug_macinfo  0 : { *(.debug_macinfo) }
+  /* SGI/MIPS DWARF 2 extensions */
+  .debug_weaknames 0 : { *(.debug_weaknames) }
+  .debug_funcnames 0 : { *(.debug_funcnames) }
+  .debug_typenames 0 : { *(.debug_typenames) }
+  .debug_varnames  0 : { *(.debug_varnames) }
+
+
+  .comment 0 : { *(.comment) }
+}
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
new file mode 100644
index 000000000000..b4b8dc59663a
--- /dev/null
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -0,0 +1,225 @@
+/*
+ *  linux/arch/x86_64/kernel/vsyscall.c
+ *
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ *  Thanks to hpa@transmeta.com for some useful hint.
+ *  Special thanks to Ingo Molnar for his early experience with
+ *  a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
+ *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
+ *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
+ *  jumping out of line if necessary. We cannot add more with this
+ *  mechanism because older kernels won't return -ENOSYS.
+ *  If we want more than four we need a vDSO.
+ *
+ *  Note: the concept clashes with user mode linux. If you use UML and
+ *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/seqlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysctl.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/io.h>
+
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define force_inline __attribute__((always_inline)) inline
+
+int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
+seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+
+#include <asm/unistd.h>
+
+static force_inline void timeval_normalize(struct timeval * tv)
+{
+	time_t __sec;
+
+	__sec = tv->tv_usec / 1000000;
+	if (__sec) {
+		tv->tv_usec %= 1000000;
+		tv->tv_sec += __sec;
+	}
+}
+
+static force_inline void do_vgettimeofday(struct timeval * tv)
+{
+	long sequence, t;
+	unsigned long sec, usec;
+
+	do {
+		sequence = read_seqbegin(&__xtime_lock);
+		
+		sec = __xtime.tv_sec;
+		usec = (__xtime.tv_nsec / 1000) +
+			(__jiffies - __wall_jiffies) * (1000000 / HZ);
+
+		if (__vxtime.mode == VXTIME_TSC) {
+			sync_core();
+			rdtscll(t);
+			if (t < __vxtime.last_tsc)
+				t = __vxtime.last_tsc;
+			usec += ((t - __vxtime.last_tsc) *
+				 __vxtime.tsc_quot) >> 32;
+			/* See comment in x86_64 do_gettimeofday. */
+		} else {
+			usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
+				  __vxtime.last) * __vxtime.quot) >> 32;
+		}
+	} while (read_seqretry(&__xtime_lock, sequence));
+
+	tv->tv_sec = sec + usec / 1000000;
+	tv->tv_usec = usec % 1000000;
+}
+
+/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
+static force_inline void do_get_tz(struct timezone * tz)
+{
+	*tz = __sys_tz;
+}
+
+static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+	int ret;
+	asm volatile("vsysc2: syscall"
+		: "=a" (ret)
+		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
+	return ret;
+}
+
+static force_inline long time_syscall(long *t)
+{
+	long secs;
+	asm volatile("vsysc1: syscall"
+		: "=a" (secs)
+		: "0" (__NR_time),"D" (t) : __syscall_clobber);
+	return secs;
+}
+
+static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+{
+	if (unlikely(!__sysctl_vsyscall))
+		return gettimeofday(tv,tz);
+	if (tv)
+		do_vgettimeofday(tv);
+	if (tz)
+		do_get_tz(tz);
+	return 0;
+}
+
+/* This will break when the xtime seconds get inaccurate, but that is
+ * unlikely */
+static time_t __vsyscall(1) vtime(time_t *t)
+{
+	if (unlikely(!__sysctl_vsyscall))
+		return time_syscall(t);
+	else if (t)
+		*t = __xtime.tv_sec;		
+	return __xtime.tv_sec;
+}
+
+static long __vsyscall(2) venosys_0(void)
+{
+	return -ENOSYS;
+}
+
+static long __vsyscall(3) venosys_1(void)
+{
+	return -ENOSYS;
+}
+
+#ifdef CONFIG_SYSCTL
+
+#define SYSCALL 0x050f
+#define NOP2    0x9090
+
+/*
+ * NOP out syscall in vsyscall page when not needed.
+ */
+static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
+                        void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	extern u16 vsysc1, vsysc2;
+	u16 *map1, *map2;
+	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	if (!write)
+		return ret;
+	/* gcc has some trouble with __va(__pa()), so just do it this
+	   way. */
+	map1 = ioremap(__pa_symbol(&vsysc1), 2);
+	if (!map1)
+		return -ENOMEM;
+	map2 = ioremap(__pa_symbol(&vsysc2), 2);
+	if (!map2) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	if (!sysctl_vsyscall) {
+		*map1 = SYSCALL;
+		*map2 = SYSCALL;
+	} else {
+		*map1 = NOP2;
+		*map2 = NOP2;
+	}
+	iounmap(map2);
+out:
+	iounmap(map1);
+	return ret;
+}
+
+static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
+				void __user *oldval, size_t __user *oldlenp,
+				void __user *newval, size_t newlen,
+				void **context)
+{
+	return -ENOSYS;
+}
+
+static ctl_table kernel_table2[] = {
+	{ .ctl_name = 99, .procname = "vsyscall64",
+	  .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
+	  .strategy = vsyscall_sysctl_nostrat,
+	  .proc_handler = vsyscall_sysctl_change },
+	{ 0, }
+};
+
+static ctl_table kernel_root_table2[] = {
+	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
+	  .child = kernel_table2 },
+	{ 0 },
+};
+
+#endif
+
+static void __init map_vsyscall(void)
+{
+	extern char __vsyscall_0;
+	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+
+	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+}
+
+static int __init vsyscall_init(void)
+{
+	BUG_ON(((unsigned long) &vgettimeofday !=
+			VSYSCALL_ADDR(__NR_vgettimeofday)));
+	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
+	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
+	map_vsyscall();
+	sysctl_vsyscall = 1;
+	register_sysctl_table(kernel_root_table2, 0);
+	return 0;
+}
+
+__initcall(vsyscall_init);
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
new file mode 100644
index 000000000000..88626e626886
--- /dev/null
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -0,0 +1,221 @@
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/user.h>
+#include <linux/sched.h>
+#include <linux/in6.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/pm.h>
+#include <linux/pci.h>
+#include <linux/apm_bios.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/syscalls.h>
+#include <linux/tty.h>
+#include <linux/ioctl32.h>
+
+#include <asm/semaphore.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/uaccess.h>
+#include <asm/checksum.h>
+#include <asm/io.h>
+#include <asm/delay.h>
+#include <asm/irq.h>
+#include <asm/mmx.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/nmi.h>
+#include <asm/kdebug.h>
+#include <asm/unistd.h>
+#include <asm/tlbflush.h>
+#include <asm/kdebug.h>
+
+extern spinlock_t rtc_lock;
+
+#ifdef CONFIG_SMP
+extern void __write_lock_failed(rwlock_t *rw);
+extern void __read_lock_failed(rwlock_t *rw);
+#endif
+
+#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
+extern struct drive_info_struct drive_info;
+EXPORT_SYMBOL(drive_info);
+#endif
+
+extern unsigned long get_cmos_time(void);
+
+/* platform dependent support */
+EXPORT_SYMBOL(boot_cpu_data);
+//EXPORT_SYMBOL(dump_fpu);
+EXPORT_SYMBOL(__ioremap);
+EXPORT_SYMBOL(ioremap_nocache);
+EXPORT_SYMBOL(iounmap);
+EXPORT_SYMBOL(enable_irq);
+EXPORT_SYMBOL(disable_irq);
+EXPORT_SYMBOL(disable_irq_nosync);
+EXPORT_SYMBOL(probe_irq_mask);
+EXPORT_SYMBOL(kernel_thread);
+EXPORT_SYMBOL(pm_idle);
+EXPORT_SYMBOL(pm_power_off);
+EXPORT_SYMBOL(get_cmos_time);
+
+EXPORT_SYMBOL(__down_failed);
+EXPORT_SYMBOL(__down_failed_interruptible);
+EXPORT_SYMBOL(__down_failed_trylock);
+EXPORT_SYMBOL(__up_wakeup);
+/* Networking helper routines. */
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
+EXPORT_SYMBOL(ip_compute_csum);
+/* Delay loops */
+EXPORT_SYMBOL(__udelay);
+EXPORT_SYMBOL(__ndelay);
+EXPORT_SYMBOL(__delay);
+EXPORT_SYMBOL(__const_udelay);
+
+EXPORT_SYMBOL(__get_user_1);
+EXPORT_SYMBOL(__get_user_2);
+EXPORT_SYMBOL(__get_user_4);
+EXPORT_SYMBOL(__get_user_8);
+EXPORT_SYMBOL(__put_user_1);
+EXPORT_SYMBOL(__put_user_2);
+EXPORT_SYMBOL(__put_user_4);
+EXPORT_SYMBOL(__put_user_8);
+
+EXPORT_SYMBOL(strpbrk);
+EXPORT_SYMBOL(strstr);
+
+EXPORT_SYMBOL(strncpy_from_user);
+EXPORT_SYMBOL(__strncpy_from_user);
+EXPORT_SYMBOL(clear_user);
+EXPORT_SYMBOL(__clear_user);
+EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(copy_in_user);
+EXPORT_SYMBOL(strnlen_user);
+
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_alloc_consistent);
+EXPORT_SYMBOL(pci_free_consistent);
+#endif
+
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+
+EXPORT_SYMBOL(copy_page);
+EXPORT_SYMBOL(clear_page);
+
+EXPORT_SYMBOL(cpu_pda);
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL(cpu_data);
+EXPORT_SYMBOL(cpu_online_map);
+EXPORT_SYMBOL(__write_lock_failed);
+EXPORT_SYMBOL(__read_lock_failed);
+
+EXPORT_SYMBOL(synchronize_irq);
+EXPORT_SYMBOL(smp_call_function);
+EXPORT_SYMBOL(cpu_callout_map);
+#endif
+
+#ifdef CONFIG_VT
+EXPORT_SYMBOL(screen_info);
+#endif
+
+EXPORT_SYMBOL(get_wchan);
+
+EXPORT_SYMBOL(rtc_lock);
+
+EXPORT_SYMBOL_GPL(set_nmi_callback);
+EXPORT_SYMBOL_GPL(unset_nmi_callback);
+
+/* Export string functions. We normally rely on gcc builtin for most of these,
+   but gcc sometimes decides not to inline them. */    
+#undef memcpy
+#undef memset
+#undef memmove
+#undef memchr
+#undef strlen
+#undef strcpy
+#undef strncmp
+#undef strncpy
+#undef strchr	
+#undef strcmp 
+#undef strcpy 
+#undef strcat
+#undef memcmp
+
+extern void * memset(void *,int,__kernel_size_t);
+extern size_t strlen(const char *);
+extern void * memmove(void * dest,const void *src,size_t count);
+extern char * strcpy(char * dest,const char *src);
+extern int strcmp(const char * cs,const char * ct);
+extern void *memchr(const void *s, int c, size_t n);
+extern void * memcpy(void *,const void *,__kernel_size_t);
+extern void * __memcpy(void *,const void *,__kernel_size_t);
+extern char * strcat(char *, const char *);
+extern int memcmp(const void * cs,const void * ct,size_t count);
+
+EXPORT_SYMBOL(memset);
+EXPORT_SYMBOL(strlen);
+EXPORT_SYMBOL(memmove);
+EXPORT_SYMBOL(strcpy);
+EXPORT_SYMBOL(strncmp);
+EXPORT_SYMBOL(strncpy);
+EXPORT_SYMBOL(strchr);
+EXPORT_SYMBOL(strcmp);
+EXPORT_SYMBOL(strcat);
+EXPORT_SYMBOL(strncat);
+EXPORT_SYMBOL(memchr);
+EXPORT_SYMBOL(strrchr);
+EXPORT_SYMBOL(strnlen);
+EXPORT_SYMBOL(memscan);
+EXPORT_SYMBOL(memcpy);
+EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(memcmp);
+
+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
+/* prototypes are wrong, these are assembly with custom calling functions */
+extern void rwsem_down_read_failed_thunk(void);
+extern void rwsem_wake_thunk(void);
+extern void rwsem_downgrade_thunk(void);
+extern void rwsem_down_write_failed_thunk(void);
+EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
+EXPORT_SYMBOL(rwsem_wake_thunk);
+EXPORT_SYMBOL(rwsem_downgrade_thunk);
+EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
+#endif
+
+EXPORT_SYMBOL(empty_zero_page);
+
+#ifdef CONFIG_HAVE_DEC_LOCK
+EXPORT_SYMBOL(_atomic_dec_and_lock);
+#endif
+
+EXPORT_SYMBOL(die_chain);
+EXPORT_SYMBOL(register_die_notifier);
+
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL(cpu_sibling_map);
+EXPORT_SYMBOL(smp_num_siblings);
+#endif
+
+extern void do_softirq_thunk(void);
+EXPORT_SYMBOL(do_softirq_thunk);
+
+void out_of_line_bug(void);
+EXPORT_SYMBOL(out_of_line_bug);
+
+EXPORT_SYMBOL(init_level4_pgt);
+
+extern unsigned long __supported_pte_mask;
+EXPORT_SYMBOL(__supported_pte_mask);
+
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL(flush_tlb_page);
+#endif
+
+EXPORT_SYMBOL(cpu_khz);
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
new file mode 100644
index 000000000000..6b26a1c1e9ff
--- /dev/null
+++ b/arch/x86_64/lib/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for x86_64-specific library files.
+#
+
+CFLAGS_csum-partial.o := -funroll-loops
+
+obj-y := io.o
+
+lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
+	usercopy.o getuser.o putuser.o  \
+	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o
+
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c
new file mode 100644
index 000000000000..a29fb75b33ac
--- /dev/null
+++ b/arch/x86_64/lib/bitops.c
@@ -0,0 +1,141 @@
+#include <linux/bitops.h>
+
+#undef find_first_zero_bit
+#undef find_next_zero_bit
+#undef find_first_bit
+#undef find_next_bit
+
+/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ */
+inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+	long d0, d1, d2;
+	long res;
+
+	if (!size)
+		return 0;
+	asm volatile(
+		"  repe; scasq\n"
+		"  je 1f\n"
+		"  xorq -8(%%rdi),%%rax\n"
+		"  subq $8,%%rdi\n"
+		"  bsfq %%rax,%%rdx\n"
+		"1:  subq %[addr],%%rdi\n"
+		"  shlq $3,%%rdi\n"
+		"  addq %%rdi,%%rdx"
+		:"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
+		:"0" (0ULL), "1" ((size + 63) >> 6), "2" (addr), "3" (-1ULL),
+		 [addr] "r" (addr) : "memory");
+	return res;
+}
+
+/**
+ * find_next_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_zero_bit (const unsigned long * addr, long size, long offset)
+{
+	unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
+	unsigned long set = 0;
+	unsigned long res, bit = offset&63;
+
+	if (bit) {
+		/*
+		 * Look for zero in first word
+		 */
+		asm("bsfq %1,%0\n\t"
+		    "cmoveq %2,%0"
+		    : "=r" (set)
+		    : "r" (~(*p >> bit)), "r"(64L));
+		if (set < (64 - bit))
+			return set + offset;
+		set = 64 - bit;
+		p++;
+	}
+	/*
+	 * No zero yet, search remaining full words for a zero
+	 */
+	res = find_first_zero_bit ((const unsigned long *)p,
+				   size - 64 * (p - (unsigned long *) addr));
+	return (offset + set + res);
+}
+
+static inline long
+__find_first_bit(const unsigned long * addr, unsigned long size)
+{
+	long d0, d1;
+	long res;
+
+	asm volatile(
+		"   repe; scasq\n"
+		"   jz 1f\n"
+		"   subq $8,%%rdi\n"
+		"   bsfq (%%rdi),%%rax\n"
+		"1: subq %[addr],%%rdi\n"
+		"   shlq $3,%%rdi\n"
+		"   addq %%rdi,%%rax"
+		:"=a" (res), "=&c" (d0), "=&D" (d1)
+		:"0" (0ULL),
+		 "1" ((size + 63) >> 6), "2" (addr),
+		 [addr] "r" (addr) : "memory");
+	return res;
+}
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_bit(const unsigned long * addr, unsigned long size)
+{
+	return __find_first_bit(addr,size);
+}
+
+/**
+ * find_next_bit - find the first set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_bit(const unsigned long * addr, long size, long offset)
+{
+	const unsigned long * p = addr + (offset >> 6);
+	unsigned long set = 0, bit = offset & 63, res;
+
+	if (bit) {
+		/*
+		 * Look for nonzero in the first 64 bits:
+		 */
+		asm("bsfq %1,%0\n\t"
+		    "cmoveq %2,%0\n\t"
+		    : "=r" (set)
+		    : "r" (*p >> bit), "r" (64L));
+		if (set < (64 - bit))
+			return set + offset;
+		set = 64 - bit;
+		p++;
+	}
+	/*
+	 * No set bit yet, search remaining full words for a bit
+	 */
+	res = __find_first_bit (p, size - 64 * (p - addr));
+	return (offset + set + res);
+}
+
+#include <linux/module.h>
+
+EXPORT_SYMBOL(find_next_bit);
+EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c
new file mode 100644
index 000000000000..24676609a6ac
--- /dev/null
+++ b/arch/x86_64/lib/bitstr.c
@@ -0,0 +1,28 @@
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+/* Find string of zero bits in a bitmap */ 
+unsigned long 
+find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
+{ 
+	unsigned long n, end, i; 	
+
+ again:
+	n = find_next_zero_bit(bitmap, nbits, start);
+	if (n == -1) 
+		return -1;
+	
+	/* could test bitsliced, but it's hardly worth it */
+	end = n+len;
+	if (end >= nbits) 
+		return -1; 
+	for (i = n+1; i < end; i++) { 
+		if (test_bit(i, bitmap)) {  
+			start = i+1; 
+			goto again; 
+		} 
+	}
+	return n;
+}
+
+EXPORT_SYMBOL(find_next_zero_string);
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
new file mode 100644
index 000000000000..30a9da458c15
--- /dev/null
+++ b/arch/x86_64/lib/clear_page.S
@@ -0,0 +1,50 @@
+/*
+ * Zero a page. 	
+ * rdi	page
+ */			
+	.globl clear_page
+	.p2align 4
+clear_page:
+	xorl   %eax,%eax
+	movl   $4096/64,%ecx
+	.p2align 4
+.Lloop:
+	decl	%ecx
+#define PUT(x) movq %rax,x*8(%rdi) 
+	movq %rax,(%rdi)
+	PUT(1)
+	PUT(2)
+	PUT(3)
+	PUT(4)
+	PUT(5)
+	PUT(6)
+	PUT(7)
+	leaq	64(%rdi),%rdi
+	jnz	.Lloop
+	nop
+	ret
+clear_page_end:	
+	
+	/* C stepping K8 run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+	
+#include <asm/cpufeature.h>
+	    	
+	.section .altinstructions,"a"
+	.align 8
+	.quad  clear_page
+	.quad  clear_page_c
+	.byte  X86_FEATURE_K8_C
+	.byte  clear_page_end-clear_page	
+	.byte  clear_page_c_end-clear_page_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+clear_page_c:
+	movl $4096/8,%ecx
+	xorl %eax,%eax
+	rep 
+	stosq
+	ret
+clear_page_c_end:
+	.previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
new file mode 100644
index 000000000000..dd3aa47b6bf5
--- /dev/null
+++ b/arch/x86_64/lib/copy_page.S
@@ -0,0 +1,101 @@
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+	
+/* Don't use streaming store because it's better when the target
+   ends up in cache. */
+	    
+/* Could vary the prefetch distance based on SMP/UP */
+
+	.globl copy_page
+	.p2align 4
+copy_page:
+	subq	$3*8,%rsp
+	movq	%rbx,(%rsp)
+	movq	%r12,1*8(%rsp)
+	movq	%r13,2*8(%rsp)
+			
+	movl	$(4096/64)-5,%ecx
+	.p2align 4
+.Loop64:	
+  	dec     %rcx
+
+	movq        (%rsi), %rax
+	movq      8 (%rsi), %rbx
+	movq     16 (%rsi), %rdx
+	movq     24 (%rsi), %r8
+	movq     32 (%rsi), %r9
+	movq     40 (%rsi), %r10
+	movq     48 (%rsi), %r11
+	movq     56 (%rsi), %r12
+
+	prefetcht0 5*64(%rsi)
+
+	movq     %rax,    (%rdi)
+	movq     %rbx,  8 (%rdi)
+	movq     %rdx, 16 (%rdi)
+	movq     %r8,  24 (%rdi)
+	movq     %r9,  32 (%rdi)
+	movq     %r10, 40 (%rdi)
+	movq     %r11, 48 (%rdi)
+	movq     %r12, 56 (%rdi)
+
+	leaq    64 (%rsi), %rsi
+	leaq    64 (%rdi), %rdi
+
+	jnz     .Loop64
+
+	movl	$5,%ecx
+	.p2align 4
+.Loop2:	
+	decl   %ecx
+
+	movq        (%rsi), %rax
+	movq      8 (%rsi), %rbx
+	movq     16 (%rsi), %rdx
+	movq     24 (%rsi), %r8
+	movq     32 (%rsi), %r9
+	movq     40 (%rsi), %r10
+	movq     48 (%rsi), %r11
+	movq     56 (%rsi), %r12
+
+	movq     %rax,    (%rdi)
+	movq     %rbx,  8 (%rdi)
+	movq     %rdx, 16 (%rdi)
+	movq     %r8,  24 (%rdi)
+	movq     %r9,  32 (%rdi)
+	movq     %r10, 40 (%rdi)
+	movq     %r11, 48 (%rdi)
+	movq     %r12, 56 (%rdi)
+	
+	leaq	64(%rdi),%rdi			
+	leaq	64(%rsi),%rsi			
+	
+	jnz	.Loop2		
+	
+	movq	(%rsp),%rbx
+	movq	1*8(%rsp),%r12
+	movq	2*8(%rsp),%r13
+	addq	$3*8,%rsp
+	ret
+	
+	/* C stepping K8 run faster using the string copy instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>		
+		
+	.section .altinstructions,"a"
+	.align 8
+	.quad  copy_page
+	.quad  copy_page_c
+	.byte  X86_FEATURE_K8_C
+	.byte  copy_page_c_end-copy_page_c
+	.byte  copy_page_c_end-copy_page_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+copy_page_c:
+	movl $4096/8,%ecx
+	rep 
+	movsq 
+	ret
+copy_page_c_end:
+	.previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
new file mode 100644
index 000000000000..bd556c804248
--- /dev/null
+++ b/arch/x86_64/lib/copy_user.S
@@ -0,0 +1,294 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ * 
+ * Functions to copy from and to user space.		
+ */		 
+
+#define FIX_ALIGNMENT 1
+		
+	#include <asm/current.h>
+	#include <asm/offset.h>
+	#include <asm/thread_info.h>
+	#include <asm/cpufeature.h>
+
+/* Standard copy_to_user with segment limit checking */		
+	.globl copy_to_user
+	.p2align 4	
+copy_to_user:
+	GET_THREAD_INFO(%rax)
+	movq %rdi,%rcx
+	addq %rdx,%rcx
+	jc  bad_to_user
+	cmpq threadinfo_addr_limit(%rax),%rcx
+	jae bad_to_user
+2:	
+	.byte 0xe9	/* 32bit jump */
+	.long .Lcug-1f
+1:
+
+	.section .altinstr_replacement,"ax"
+3:	.byte 0xe9			/* replacement jmp with 8 bit immediate */
+	.long copy_user_generic_c-1b	/* offset */
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad  2b
+	.quad  3b
+	.byte  X86_FEATURE_K8_C
+	.byte  5
+	.byte  5
+	.previous
+
+/* Standard copy_from_user with segment limit checking */	
+	.globl copy_from_user
+	.p2align 4	
+copy_from_user:
+	GET_THREAD_INFO(%rax)
+	movq %rsi,%rcx
+	addq %rdx,%rcx
+	jc  bad_from_user
+	cmpq threadinfo_addr_limit(%rax),%rcx
+	jae  bad_from_user
+	/* FALL THROUGH to copy_user_generic */
+	
+	.section .fixup,"ax"
+	/* must zero dest */
+bad_from_user:
+	movl %edx,%ecx
+	xorl %eax,%eax
+	rep
+	stosb
+bad_to_user:
+	movl	%edx,%eax
+	ret
+	.previous
+	
+		
+/*
+ * copy_user_generic - memory copy with exception handling.
+ * 	
+ * Input:	
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:		
+ * eax uncopied bytes or 0 if successful.
+ */
+	.globl copy_user_generic	
+	.p2align 4
+copy_user_generic:	
+	.byte 0x66,0x66,0x90	/* 5 byte nop for replacement jump */	
+	.byte 0x66,0x90
+1:		
+	.section .altinstr_replacement,"ax"
+2:	.byte 0xe9	             /* near jump with 32bit immediate */
+	.long copy_user_generic_c-1b /* offset */
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad  copy_user_generic
+	.quad  2b
+	.byte  X86_FEATURE_K8_C
+	.byte  5
+	.byte  5
+	.previous
+.Lcug:	
+	pushq %rbx
+	xorl %eax,%eax		/*zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+	movq %rdx,%rcx
+
+	movl $64,%ebx	
+	shrq $6,%rdx
+	decq %rdx
+	js   .Lhandle_tail
+	
+	.p2align 4
+.Lloop:
+.Ls1:	movq (%rsi),%r11
+.Ls2:	movq 1*8(%rsi),%r8
+.Ls3:	movq 2*8(%rsi),%r9
+.Ls4:	movq 3*8(%rsi),%r10
+.Ld1:	movq %r11,(%rdi)
+.Ld2:	movq %r8,1*8(%rdi)
+.Ld3:	movq %r9,2*8(%rdi)
+.Ld4:	movq %r10,3*8(%rdi)
+		
+.Ls5:	movq 4*8(%rsi),%r11
+.Ls6:	movq 5*8(%rsi),%r8
+.Ls7:	movq 6*8(%rsi),%r9
+.Ls8:	movq 7*8(%rsi),%r10
+.Ld5:	movq %r11,4*8(%rdi)
+.Ld6:	movq %r8,5*8(%rdi)
+.Ld7:	movq %r9,6*8(%rdi)
+.Ld8:	movq %r10,7*8(%rdi)
+	
+	decq %rdx
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	
+	jns  .Lloop
+
+	.p2align 4
+.Lhandle_tail:
+	movl %ecx,%edx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	movl $8,%ebx
+	.p2align 4
+.Lloop_8:
+.Ls9:	movq (%rsi),%r8
+.Ld9:	movq %r8,(%rdi)
+	decl %ecx
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz .Lloop_8
+	
+.Lhandle_7:		
+	movl %edx,%ecx	
+	andl $7,%ecx
+	jz   .Lende
+	.p2align 4
+.Lloop_1:
+.Ls10:	movb (%rsi),%bl
+.Ld10:	movb %bl,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+			
+.Lende:
+	popq %rbx
+	ret	
+
+#ifdef FIX_ALIGNMENT		  		
+	/* align destination */
+	.p2align 4
+.Lbad_alignment:
+	movl $8,%r9d
+	subl %ecx,%r9d
+	movl %r9d,%ecx
+	cmpq %r9,%rdx
+	jz   .Lhandle_7
+	js   .Lhandle_7
+.Lalign_1:		
+.Ls11:	movb (%rsi),%bl
+.Ld11:	movb %bl,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz .Lalign_1
+	subq %r9,%rdx
+	jmp .Lafter_bad_alignment
+#endif
+	
+	/* table sorted by exception address */	
+	.section __ex_table,"a"
+	.align 8
+	.quad .Ls1,.Ls1e
+	.quad .Ls2,.Ls2e
+	.quad .Ls3,.Ls3e
+	.quad .Ls4,.Ls4e	
+	.quad .Ld1,.Ls1e
+	.quad .Ld2,.Ls2e
+	.quad .Ld3,.Ls3e
+	.quad .Ld4,.Ls4e
+	.quad .Ls5,.Ls5e
+	.quad .Ls6,.Ls6e
+	.quad .Ls7,.Ls7e
+	.quad .Ls8,.Ls8e	
+	.quad .Ld5,.Ls5e
+	.quad .Ld6,.Ls6e
+	.quad .Ld7,.Ls7e
+	.quad .Ld8,.Ls8e
+	.quad .Ls9,.Le_quad
+	.quad .Ld9,.Le_quad
+	.quad .Ls10,.Le_byte
+	.quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT	
+	.quad .Ls11,.Lzero_rest
+	.quad .Ld11,.Lzero_rest
+#endif
+	.quad .Le5,.Le_zero
+	.previous
+
+	/* compute 64-offset for main loop. 8 bytes accuracy with error on the 
+	   pessimistic side. this is gross. it would be better to fix the 
+	   interface. */	
+	/* eax: zero, ebx: 64 */
+.Ls1e: 	addl $8,%eax
+.Ls2e: 	addl $8,%eax
+.Ls3e: 	addl $8,%eax
+.Ls4e: 	addl $8,%eax
+.Ls5e: 	addl $8,%eax
+.Ls6e: 	addl $8,%eax
+.Ls7e: 	addl $8,%eax
+.Ls8e: 	addl $8,%eax
+	addq %rbx,%rdi	/* +64 */
+	subq %rax,%rdi  /* correct destination with computed offset */
+
+	shlq $6,%rdx	/* loop counter * 64 (stride length) */
+	addq %rax,%rdx	/* add offset to loopcnt */
+	andl $63,%ecx	/* remaining bytes */
+	addq %rcx,%rdx	/* add them */
+	jmp .Lzero_rest
+
+	/* exception on quad word loop in tail handling */
+	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+	shll $3,%ecx
+	andl $7,%edx
+	addl %ecx,%edx
+	/* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+	movq %rdx,%rcx
+.Le_byte:
+	xorl %eax,%eax
+.Le5:	rep 
+	stosb
+	/* when there is another exception while zeroing the rest just return */
+.Le_zero:		
+	movq %rdx,%rax
+	jmp .Lende
+
+	/* C stepping K8 run faster using the string copy instructions.
+	   This is also a lot simpler. Use them when possible.
+	   Patch in jmps to this code instead of copying it fully
+	   to avoid unwanted aliasing in the exception tables. */
+		
+ /* rdi	destination
+  * rsi source
+  * rdx count
+  *
+  * Output:		
+  * eax uncopied bytes or 0 if successfull.
+  */			
+copy_user_generic_c:
+	movl %edx,%ecx
+	shrl $3,%ecx
+	andl $7,%edx	
+1:	rep 
+	movsq 
+	movl %edx,%ecx
+2:	rep
+	movsb
+4:	movl %ecx,%eax
+	ret
+3:	lea (%rdx,%rcx,8),%rax
+	ret
+	
+	.section __ex_table,"a"
+	.quad 1b,3b
+	.quad 2b,4b
+	.previous
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
new file mode 100644
index 000000000000..01808ec37836
--- /dev/null
+++ b/arch/x86_64/lib/csum-copy.S
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ *	
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details. No warranty for anything given at all.
+ */
+ 	#include <linux/linkage.h>
+	#include <asm/errno.h>
+
+/*
+ * Checksum copy with exception handling.
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 
+ * destination is zeroed.
+ * 
+ * Input
+ * rdi  source
+ * rsi  destination
+ * edx  len (32bit)
+ * ecx  sum (32bit) 
+ * r8   src_err_ptr (int)
+ * r9   dst_err_ptr (int)
+ *
+ * Output
+ * eax  64bit sum. undefined in case of exception.
+ * 
+ * Wrappers need to take care of valid exception sum and zeroing.		 
+ * They also should align source or destination to 8 bytes.
+ */
+
+	.macro source
+10:
+	.section __ex_table,"a"
+	.align 8
+	.quad 10b,.Lbad_source
+	.previous
+	.endm
+		
+	.macro dest
+20:
+	.section __ex_table,"a"
+	.align 8
+	.quad 20b,.Lbad_dest
+	.previous
+	.endm
+			
+	.macro ignore L=.Lignore
+30:
+	.section __ex_table,"a"
+	.align 8
+	.quad 30b,\L
+	.previous
+	.endm
+	
+				
+	.globl csum_partial_copy_generic
+	.p2align 4
+csum_partial_copy_generic:
+	cmpl	 $3*64,%edx
+	jle	 .Lignore
+
+.Lignore:		
+	subq  $7*8,%rsp
+	movq  %rbx,2*8(%rsp)
+	movq  %r12,3*8(%rsp)
+	movq  %r14,4*8(%rsp)
+	movq  %r13,5*8(%rsp)
+	movq  %rbp,6*8(%rsp)
+
+	movq  %r8,(%rsp)
+	movq  %r9,1*8(%rsp)
+	
+	movl  %ecx,%eax
+	movl  %edx,%ecx
+
+	xorl  %r9d,%r9d
+	movq  %rcx,%r12
+
+	shrq  $6,%r12
+	jz    .Lhandle_tail       /* < 64 */
+
+	clc
+	
+	/* main loop. clear in 64 byte blocks */
+	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
+	/* r11:	temp3, rdx: temp4, r12 loopcnt */
+	/* r10:	temp5, rbp: temp6, r14 temp7, r13 temp8 */
+	.p2align 4
+.Lloop:
+	source
+	movq  (%rdi),%rbx
+	source
+	movq  8(%rdi),%r8
+	source
+	movq  16(%rdi),%r11
+	source
+	movq  24(%rdi),%rdx
+
+	source
+	movq  32(%rdi),%r10
+	source
+	movq  40(%rdi),%rbp
+	source
+	movq  48(%rdi),%r14
+	source
+	movq  56(%rdi),%r13
+		
+	ignore 2f
+	prefetcht0 5*64(%rdi)
+2:							
+	adcq  %rbx,%rax
+	adcq  %r8,%rax
+	adcq  %r11,%rax
+	adcq  %rdx,%rax
+	adcq  %r10,%rax
+	adcq  %rbp,%rax
+	adcq  %r14,%rax
+	adcq  %r13,%rax
+
+	decl %r12d
+	
+	dest
+	movq %rbx,(%rsi)
+	dest
+	movq %r8,8(%rsi)
+	dest
+	movq %r11,16(%rsi)
+	dest
+	movq %rdx,24(%rsi)
+
+	dest
+	movq %r10,32(%rsi)
+	dest
+	movq %rbp,40(%rsi)
+	dest
+	movq %r14,48(%rsi)
+	dest
+	movq %r13,56(%rsi)
+	
+3:
+	
+	leaq 64(%rdi),%rdi
+	leaq 64(%rsi),%rsi
+
+	jnz   .Lloop
+
+	adcq  %r9,%rax
+
+	/* do last upto 56 bytes */
+.Lhandle_tail:
+	/* ecx:	count */
+	movl %ecx,%r10d
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz 	 .Lfold
+	clc
+	.p2align 4
+.Lloop_8:	
+	source
+	movq (%rdi),%rbx
+	adcq %rbx,%rax
+	decl %ecx
+	dest
+	movq %rbx,(%rsi)
+	leaq 8(%rsi),%rsi /* preserve carry */
+	leaq 8(%rdi),%rdi
+	jnz	.Lloop_8
+	adcq %r9,%rax	/* add in carry */
+
+.Lfold:
+	/* reduce checksum to 32bits */
+	movl %eax,%ebx
+	shrq $32,%rax
+	addl %ebx,%eax
+	adcl %r9d,%eax
+
+	/* do last upto 6 bytes */	
+.Lhandle_7:
+	movl %r10d,%ecx
+	andl $7,%ecx
+	shrl $1,%ecx
+	jz   .Lhandle_1
+	movl $2,%edx
+	xorl %ebx,%ebx
+	clc  
+	.p2align 4
+.Lloop_1:	
+	source
+	movw (%rdi),%bx
+	adcl %ebx,%eax
+	dest
+	decl %ecx
+	movw %bx,(%rsi)
+	leaq 2(%rdi),%rdi
+	leaq 2(%rsi),%rsi
+	jnz .Lloop_1
+	adcl %r9d,%eax	/* add in carry */
+	
+	/* handle last odd byte */
+.Lhandle_1:
+	testl $1,%r10d
+	jz    .Lende
+	xorl  %ebx,%ebx
+	source
+	movb (%rdi),%bl
+	dest
+	movb %bl,(%rsi)
+	addl %ebx,%eax
+	adcl %r9d,%eax		/* carry */
+			
+.Lende:
+	movq 2*8(%rsp),%rbx
+	movq 3*8(%rsp),%r12
+	movq 4*8(%rsp),%r14
+	movq 5*8(%rsp),%r13
+	movq 6*8(%rsp),%rbp
+	addq $7*8,%rsp
+	ret
+
+	/* Exception handlers. Very simple, zeroing is done in the wrappers */
+.Lbad_source:
+	movq (%rsp),%rax
+	testq %rax,%rax
+	jz   .Lende
+	movl $-EFAULT,(%rax)
+	jmp  .Lende
+	
+.Lbad_dest:
+	movq 8(%rsp),%rax
+	testq %rax,%rax
+	jz   .Lende	
+	movl $-EFAULT,(%rax)
+	jmp .Lende
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c
new file mode 100644
index 000000000000..5384e227cdf6
--- /dev/null
+++ b/arch/x86_64/lib/csum-partial.c
@@ -0,0 +1,150 @@
+/*
+ * arch/x86_64/lib/csum-partial.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed.
+ */
+ 
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <asm/checksum.h>
+
+#define __force_inline inline __attribute__((always_inline))
+
+static inline unsigned short from32to16(unsigned a) 
+{
+	unsigned short b = a >> 16; 
+	asm("addw %w2,%w0\n\t"
+	    "adcw $0,%w0\n" 
+	    : "=r" (b)
+	    : "0" (b), "r" (a));
+	return b;
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 32bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+ * 
+ * Things tried and found to not make it faster:
+ * Manual Prefetching
+ * Unrolling to an 128 bytes inner loop.
+ * Using interleaving with more registers to break the carry chains.
+ */
+static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len)
+{
+	unsigned odd, count;
+	unsigned long result = 0;
+
+	if (unlikely(len == 0))
+		return result; 
+	odd = 1 & (unsigned long) buff;
+	if (unlikely(odd)) {
+		result = *buff << 8;
+		len--;
+		buff++;
+	}
+	count = len >> 1;		/* nr of 16-bit words.. */
+	if (count) {
+		if (2 & (unsigned long) buff) {
+			result += *(unsigned short *)buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;		/* nr of 32-bit words.. */
+		if (count) {
+			unsigned long zero;
+			unsigned count64;
+			if (4 & (unsigned long) buff) {
+				result += *(unsigned int *) buff;
+				count--;
+				len -= 4;
+				buff += 4;
+			}
+			count >>= 1;	/* nr of 64-bit words.. */
+
+			/* main loop using 64byte blocks */
+			zero = 0;
+			count64 = count >> 3;
+			while (count64) { 
+				asm("addq 0*8(%[src]),%[res]\n\t"
+				    "adcq 1*8(%[src]),%[res]\n\t"
+				    "adcq 2*8(%[src]),%[res]\n\t"
+				    "adcq 3*8(%[src]),%[res]\n\t"
+				    "adcq 4*8(%[src]),%[res]\n\t"
+				    "adcq 5*8(%[src]),%[res]\n\t"
+				    "adcq 6*8(%[src]),%[res]\n\t"
+				    "adcq 7*8(%[src]),%[res]\n\t"
+				    "adcq %[zero],%[res]"
+				    : [res] "=r" (result)
+				    : [src] "r" (buff), [zero] "r" (zero),
+				    "[res]" (result));
+				buff += 64;
+				count64--;
+			}
+
+			/* last upto 7 8byte blocks */
+			count %= 8; 
+			while (count) { 
+				asm("addq %1,%0\n\t"
+				    "adcq %2,%0\n" 
+					    : "=r" (result)
+				    : "m" (*(unsigned long *)buff), 
+				    "r" (zero),  "0" (result));
+				--count; 
+					buff += 8;
+			}
+			result = add32_with_carry(result>>32,
+						  result&0xffffffff); 
+
+			if (len & 4) {
+				result += *(unsigned int *) buff;
+				buff += 4;
+			}
+		}
+		if (len & 2) {
+			result += *(unsigned short *) buff;
+			buff += 2;
+		}
+	}
+	if (len & 1)
+		result += *buff;
+	result = add32_with_carry(result>>32, result & 0xffffffff); 
+	if (unlikely(odd)) { 
+		result = from32to16(result);
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+	}
+	return result;
+}
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 64-bit boundary
+ */
+unsigned csum_partial(const unsigned char *buff, unsigned len, unsigned sum)
+{
+	return add32_with_carry(do_csum(buff, len), sum); 
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+unsigned short ip_compute_csum(unsigned char * buff, int len)
+{
+	return csum_fold(csum_partial(buff,len,0));
+}
+
diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c
new file mode 100644
index 000000000000..94323f20816e
--- /dev/null
+++ b/arch/x86_64/lib/csum-wrappers.c
@@ -0,0 +1,129 @@
+/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v.2
+ * 
+ * Wrappers of assembly checksum functions for x86-64.
+ */
+
+#include <asm/checksum.h>
+#include <linux/module.h>
+
+/** 
+ * csum_partial_copy_from_user - Copy and checksum from user space. 
+ * @src: source address (user space) 
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for an bad source address.
+ * 
+ * Returns an 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits. 
+ */ 
+unsigned int 
+csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst,
+			    int len, unsigned int isum, int *errp)
+{ 
+	might_sleep();
+	*errp = 0;
+	if (likely(access_ok(VERIFY_READ,src, len))) { 
+		/* Why 6, not 7? To handle odd addresses aligned we
+		   would need to do considerable complications to fix the
+		   checksum which is defined as an 16bit accumulator. The
+		   fix alignment code is primarily for performance
+		   compatibility with 32bit and that will handle odd
+		   addresses slowly too. */
+		if (unlikely((unsigned long)src & 6)) {			
+			while (((unsigned long)src & 6) && len >= 2) { 
+				__u16 val16;			
+				*errp = __get_user(val16, (__u16 __user *)src); 
+				if (*errp)
+					return isum;
+				*(__u16 *)dst = val16;
+				isum = add32_with_carry(isum, val16); 
+				src += 2; 
+				dst += 2; 
+				len -= 2;
+			}
+		}
+		isum = csum_partial_copy_generic((__force void *)src,dst,len,isum,errp,NULL);
+		if (likely(*errp == 0)) 
+			return isum;
+	} 
+	*errp = -EFAULT;
+	memset(dst,0,len); 
+	return isum;		
+} 
+
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+
+/** 
+ * csum_partial_copy_to_user - Copy and checksum to user space. 
+ * @src: source address
+ * @dst: destination address (user space)
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for an bad destination address.
+ * 
+ * Returns an 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */ 
+unsigned int 
+csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst,
+			  int len, unsigned int isum, int *errp)
+{ 
+	might_sleep();
+	if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
+		*errp = -EFAULT;
+		return 0; 
+	}
+
+	if (unlikely((unsigned long)dst & 6)) {
+		while (((unsigned long)dst & 6) && len >= 2) { 
+			__u16 val16 = *(__u16 *)src;
+			isum = add32_with_carry(isum, val16);
+			*errp = __put_user(val16, (__u16 __user *)dst);
+			if (*errp)
+				return isum;
+			src += 2; 
+			dst += 2; 
+			len -= 2;
+		}
+	}
+
+	*errp = 0;
+	return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); 
+} 
+
+EXPORT_SYMBOL(csum_partial_copy_to_user);
+
+/** 
+ * csum_partial_copy_nocheck - Copy and checksum.
+ * @src: source address
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * 
+ * Returns an 32bit unfolded checksum of the buffer.
+ */ 
+unsigned int 
+csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len, unsigned int sum)
+{ 
+	return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);
+} 
+
+unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr,
+			       __u32 len, unsigned short proto, unsigned int sum) 
+{
+	__u64 rest, sum64;
+     
+	rest = (__u64)htonl(len) + (__u64)htons(proto) + (__u64)sum;
+	asm("  addq (%[saddr]),%[sum]\n"
+	    "  adcq 8(%[saddr]),%[sum]\n"
+	    "  adcq (%[daddr]),%[sum]\n" 
+	    "  adcq 8(%[daddr]),%[sum]\n"
+	    "  adcq $0,%[sum]\n"
+	    : [sum] "=r" (sum64) 
+	    : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
+	return csum_fold(add32_with_carry(sum64 & 0xffffffff, sum64>>32));
+}
+
+EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/x86_64/lib/dec_and_lock.c b/arch/x86_64/lib/dec_and_lock.c
new file mode 100644
index 000000000000..ab43394dc775
--- /dev/null
+++ b/arch/x86_64/lib/dec_and_lock.c
@@ -0,0 +1,40 @@
+/*
+ * x86 version of "atomic_dec_and_lock()" using
+ * the atomic "cmpxchg" instruction.
+ *
+ * (For CPU's lacking cmpxchg, we use the slow
+ * generic version, and this one never even gets
+ * compiled).
+ */
+
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+{
+	int counter;
+	int newcount;
+
+repeat:
+	counter = atomic_read(atomic);
+	newcount = counter-1;
+
+	if (!newcount)
+		goto slow_path;
+
+	asm volatile("lock; cmpxchgl %1,%2"
+		:"=a" (newcount)
+		:"r" (newcount), "m" (atomic->counter), "0" (counter));
+
+	/* If the above failed, "eax" will have changed */
+	if (newcount != counter)
+		goto repeat;
+	return 0;
+
+slow_path:
+	spin_lock(lock);
+	if (atomic_dec_and_test(atomic))
+		return 1;
+	spin_unlock(lock);
+	return 0;
+}
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
new file mode 100644
index 000000000000..6e2d66472eb1
--- /dev/null
+++ b/arch/x86_64/lib/delay.c
@@ -0,0 +1,48 @@
+/*
+ *	Precise Delay Loops for x86-64
+ *
+ *	Copyright (C) 1993 Linus Torvalds
+ *	Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ *	The __delay function must _NOT_ be inlined as its execution time
+ *	depends wildly on alignment on many x86 processors. 
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <asm/delay.h>
+
+#ifdef CONFIG_SMP
+#include <asm/smp.h>
+#endif
+
+int x86_udelay_tsc = 0;		/* Delay via TSC */
+
+void __delay(unsigned long loops)
+{
+	unsigned bclock, now;
+	
+	rdtscl(bclock);
+	do
+	{
+		rep_nop(); 
+		rdtscl(now);
+	}
+	while((now-bclock) < loops);
+}
+
+inline void __const_udelay(unsigned long xloops)
+{
+	__delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
+}
+
+void __udelay(unsigned long usecs)
+{
+	__const_udelay(usecs * 0x000010c6);  /* 2**32 / 1000000 */
+}
+
+void __ndelay(unsigned long nsecs)
+{
+	__const_udelay(nsecs * 0x00005);  /* 2**32 / 1000000000 (rounded up) */
+}
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S
new file mode 100644
index 000000000000..f94ea8a44051
--- /dev/null
+++ b/arch/x86_64/lib/getuser.S
@@ -0,0 +1,101 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __get_user_X
+ *
+ * Inputs:	%rcx contains the address.
+ *		The register is modified, but all changes are undone
+ *		before returning because the C code doesn't know about it.
+ *
+ * Outputs:	%rax is error code (0 or -EFAULT)
+ *		%rdx contains zero-extended value
+ * 
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/offset.h>
+#include <asm/thread_info.h>
+
+	.text
+	.p2align 4
+.globl __get_user_1
+__get_user_1:	
+	GET_THREAD_INFO(%r8)
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae bad_get_user
+1:	movzb (%rcx),%edx
+	xorl %eax,%eax
+	ret
+
+	.p2align 4
+.globl __get_user_2
+__get_user_2:
+	GET_THREAD_INFO(%r8)
+	addq $1,%rcx
+	jc 20f
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae 20f
+	decq   %rcx
+2:	movzwl (%rcx),%edx
+	xorl %eax,%eax
+	ret
+20:	decq    %rcx
+	jmp	bad_get_user
+
+	.p2align 4
+.globl __get_user_4
+__get_user_4:
+	GET_THREAD_INFO(%r8)
+	addq $3,%rcx
+	jc 30f
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae 30f
+	subq $3,%rcx
+3:	movl (%rcx),%edx
+	xorl %eax,%eax
+	ret
+30:	subq $3,%rcx
+	jmp bad_get_user
+
+	.p2align 4
+.globl __get_user_8
+__get_user_8:
+	GET_THREAD_INFO(%r8)
+	addq $7,%rcx
+	jc bad_get_user
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae	bad_get_user
+	subq	$7,%rcx
+4:	movq (%rcx),%rdx
+	xorl %eax,%eax
+	ret
+40:	subq $7,%rcx
+	jmp bad_get_user
+
+bad_get_user:
+	xorl %edx,%edx
+	movq $(-EFAULT),%rax
+	ret
+
+.section __ex_table,"a"
+	.quad 1b,bad_get_user
+	.quad 2b,bad_get_user
+	.quad 3b,bad_get_user
+	.quad 4b,bad_get_user
+.previous
diff --git a/arch/x86_64/lib/io.c b/arch/x86_64/lib/io.c
new file mode 100644
index 000000000000..87b4a4e18039
--- /dev/null
+++ b/arch/x86_64/lib/io.c
@@ -0,0 +1,23 @@
+#include <linux/string.h>
+#include <asm/io.h>
+#include <linux/module.h>
+
+void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
+{
+	__inline_memcpy((void *) dst,src,len);
+}
+EXPORT_SYMBOL(__memcpy_toio);
+
+void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
+{
+	__inline_memcpy(dst,(const void *) src,len);
+}
+EXPORT_SYMBOL(__memcpy_fromio);
+
+void memset_io(volatile void __iomem *a, int b, size_t c)
+{
+	/* XXX: memset can mangle the IO patterns quite a bit.
+	   perhaps it would be better to use a dumb one */
+	memset((void *)a,b,c);
+}
+EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
new file mode 100644
index 000000000000..c6c46494fef5
--- /dev/null
+++ b/arch/x86_64/lib/memcpy.S
@@ -0,0 +1,121 @@
+/* Copyright 2002 Andi Kleen */
+	
+	#include <asm/cpufeature.h>		
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:	
+ * rdi destination
+ * rsi source
+ * rdx count
+ * 
+ * Output:
+ * rax original destination
+ */	
+
+ 	.globl __memcpy
+	.globl memcpy
+	.p2align 4
+__memcpy:
+memcpy:		
+	pushq %rbx
+	movq %rdi,%rax
+
+	movl %edx,%ecx
+	shrl $6,%ecx
+	jz .Lhandle_tail
+	
+	.p2align 4
+.Lloop_64:
+	decl %ecx
+	
+	movq (%rsi),%r11
+	movq 8(%rsi),%r8
+
+	movq %r11,(%rdi)
+	movq %r8,1*8(%rdi)
+
+	movq 2*8(%rsi),%r9
+	movq 3*8(%rsi),%r10
+
+	movq %r9,2*8(%rdi)
+	movq %r10,3*8(%rdi)
+		
+	movq 4*8(%rsi),%r11
+	movq 5*8(%rsi),%r8
+
+	movq %r11,4*8(%rdi)
+	movq %r8,5*8(%rdi)
+
+	movq 6*8(%rsi),%r9
+	movq 7*8(%rsi),%r10
+
+	movq %r9,6*8(%rdi)
+	movq %r10,7*8(%rdi)
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	jnz  .Lloop_64
+
+.Lhandle_tail:
+	movl %edx,%ecx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	.p2align 4
+.Lloop_8: 
+	decl %ecx
+	movq (%rsi),%r8
+	movq %r8,(%rdi) 
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz  .Lloop_8
+
+.Lhandle_7:
+	movl %edx,%ecx
+	andl $7,%ecx
+	jz .Lende
+	.p2align 4
+.Lloop_1:
+	movb (%rsi),%r8b
+	movb %r8b,(%rdi) 
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+	
+.Lende: 	
+	popq %rbx
+	ret
+.Lfinal:
+	
+	/* C stepping K8 run faster using the string copy instructions.
+	   It is also a lot simpler. Use this when possible */
+	
+	.section .altinstructions,"a"
+	.align 8
+	.quad  memcpy
+	.quad  memcpy_c
+	.byte  X86_FEATURE_K8_C
+	.byte  .Lfinal-memcpy
+	.byte  memcpy_c_end-memcpy_c	
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi source
+  * rdx count
+  */			
+memcpy_c:
+	movq %rdi,%rax
+	movl %edx,%ecx
+	shrl $3,%ecx
+	andl $7,%edx	
+	rep 
+	movsq 
+	movl %edx,%ecx
+	rep
+	movsb
+	ret
+memcpy_c_end:
+	.previous
diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c
new file mode 100644
index 000000000000..e93d5255fdc9
--- /dev/null
+++ b/arch/x86_64/lib/memmove.c
@@ -0,0 +1,19 @@
+/* Normally compiler builtins are used, but sometimes the compiler calls out
+   of line code. Based on asm-i386/string.h.
+ */
+#define _STRING_C
+#include <linux/string.h>
+
+#undef memmove
+void *memmove(void * dest,const void *src,size_t count)
+{
+	if (dest < src) { 
+		__inline_memcpy(dest,src,count);
+	} else {
+		char *p = (char *) dest + count;
+		char *s = (char *) src + count;
+		while (count--)
+			*--p = *--s;
+	}
+	return dest;
+} 
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
new file mode 100644
index 000000000000..4b4c40638640
--- /dev/null
+++ b/arch/x86_64/lib/memset.S
@@ -0,0 +1,125 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *	
+ * rdi   destination
+ * rsi   value (char) 
+ * rdx   count (bytes) 
+ * 
+ * rax   original destination
+ */	
+ 	.globl __memset
+	.globl memset
+	.p2align 4
+memset:	
+__memset:
+	movq %rdi,%r10
+	movq %rdx,%r11
+
+	/* expand byte value  */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	mul    %rcx		/* with rax, clobbers rdx */
+
+	/* align dst */
+	movl  %edi,%r9d		
+	andl  $7,%r9d	
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+	
+	movl %r11d,%ecx
+	shrl $6,%ecx
+	jz	 .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:	
+	decl   %ecx
+	movq  %rax,(%rdi) 
+	movq  %rax,8(%rdi) 
+	movq  %rax,16(%rdi) 
+	movq  %rax,24(%rdi) 
+	movq  %rax,32(%rdi) 
+	movq  %rax,40(%rdi) 
+	movq  %rax,48(%rdi) 
+	movq  %rax,56(%rdi) 
+	leaq  64(%rdi),%rdi
+	jnz    .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */ 
+	.p2align 4	   
+.Lhandle_tail:
+	movl	%r11d,%ecx
+	andl    $63&(~7),%ecx
+	jz 		.Lhandle_7
+	shrl	$3,%ecx
+	.p2align 4
+.Lloop_8:
+	decl   %ecx
+	movq  %rax,(%rdi)
+	leaq  8(%rdi),%rdi
+	jnz    .Lloop_8
+
+.Lhandle_7:
+	movl	%r11d,%ecx
+	andl	$7,%ecx
+	jz      .Lende
+	.p2align 4
+.Lloop_1:
+	decl    %ecx
+	movb 	%al,(%rdi)
+	leaq	1(%rdi),%rdi
+	jnz     .Lloop_1
+	
+.Lende:	
+	movq	%r10,%rax
+	ret
+
+.Lbad_alignment:
+	cmpq $7,%r11
+	jbe	.Lhandle_7
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8			
+	subq %r9,%r8 
+	addq %r8,%rdi
+	subq %r8,%r11
+	jmp .Lafter_bad_alignment
+
+	/* C stepping K8 run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>	
+		
+	.section .altinstructions,"a"
+	.align 8
+	.quad  memset
+	.quad  memset_c
+	.byte  X86_FEATURE_K8_C
+	.byte  memset_c_end-memset_c
+	.byte  memset_c_end-memset_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi value
+  * rdx count
+  */			
+memset_c:	
+	movq %rdi,%r9
+	movl %edx,%r8d
+	andl $7,%r8d		
+	movl %edx,%ecx
+	shrl $3,%ecx		
+	/* expand byte value  */
+	movzbl %sil,%esi
+	movabs $0x0101010101010101,%rax
+	mulq   %rsi		/* with rax, clobbers rdx */
+	rep
+	stosq	
+	movl %r8d,%ecx
+	rep
+	stosb
+	movq %r9,%rax
+	ret
+memset_c_end:
+	.previous
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S
new file mode 100644
index 000000000000..0dee1fdcb162
--- /dev/null
+++ b/arch/x86_64/lib/putuser.S
@@ -0,0 +1,89 @@
+/*
+ * __put_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __put_user_X
+ *
+ * Inputs:	%rcx contains the address
+ *		%rdx contains new value
+ *
+ * Outputs:	%rax is error code (0 or -EFAULT)
+ *
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/offset.h>
+#include <asm/thread_info.h>
+
+	.text
+	.p2align 4
+.globl __put_user_1
+__put_user_1:
+	GET_THREAD_INFO(%r8)
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae bad_put_user
+1:	movb %dl,(%rcx)
+	xorl %eax,%eax
+	ret
+
+	.p2align 4
+.globl __put_user_2
+__put_user_2:
+	GET_THREAD_INFO(%r8)
+	addq $1,%rcx
+	jc bad_put_user
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae	 bad_put_user
+2:	movw %dx,-1(%rcx)
+	xorl %eax,%eax
+	ret
+
+	.p2align 4
+.globl __put_user_4
+__put_user_4:
+	GET_THREAD_INFO(%r8)
+	addq $3,%rcx
+	jc bad_put_user
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae bad_put_user
+3:	movl %edx,-3(%rcx)
+	xorl %eax,%eax
+	ret
+
+	.p2align 4
+.globl __put_user_8
+__put_user_8:
+	GET_THREAD_INFO(%r8)
+	addq $7,%rcx
+	jc bad_put_user
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae	bad_put_user
+4:	movq %rdx,-7(%rcx)
+	xorl %eax,%eax
+	ret
+
+bad_put_user:
+	movq $(-EFAULT),%rax
+	ret
+
+.section __ex_table,"a"
+	.quad 1b,bad_put_user
+	.quad 2b,bad_put_user
+	.quad 3b,bad_put_user
+	.quad 4b,bad_put_user
+.previous
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
new file mode 100644
index 000000000000..acc1e2ca7ed7
--- /dev/null
+++ b/arch/x86_64/lib/thunk.S
@@ -0,0 +1,95 @@
+	/*
+	 * Save registers before calling assembly functions. This avoids
+	 * disturbance of register allocation in some inline assembly constructs.
+	 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+	 * Subject to the GNU public license, v.2. No warranty of any kind.
+	 * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $
+	 */
+
+	#include <linux/config.h>
+	#include <linux/linkage.h>
+	#include <asm/dwarf2.h>
+	#include <asm/calling.h>			
+	#include <asm/rwlock.h>
+		
+	/* rdi:	arg1 ... normal C conventions. rax is saved/restored. */ 	
+	.macro thunk name,func
+	.globl \name
+\name:	
+	CFI_STARTPROC
+	SAVE_ARGS
+	call \func
+	jmp  restore
+	CFI_ENDPROC
+	.endm
+
+	/* rdi:	arg1 ... normal C conventions. rax is passed from C. */ 	
+	.macro thunk_retrax name,func
+	.globl \name
+\name:	
+	CFI_STARTPROC
+	SAVE_ARGS
+	call \func
+	jmp  restore_norax
+	CFI_ENDPROC
+	.endm
+	
+
+	.section .sched.text
+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
+	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
+	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
+	thunk rwsem_wake_thunk,rwsem_wake
+	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
+#endif	
+	thunk do_softirq_thunk,do_softirq
+	
+	thunk __down_failed,__down
+	thunk_retrax __down_failed_interruptible,__down_interruptible
+	thunk_retrax __down_failed_trylock,__down_trylock
+	thunk __up_wakeup,__up
+	
+	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
+	CFI_STARTPROC
+	SAVE_ARGS
+restore:
+	RESTORE_ARGS
+	ret	
+	CFI_ENDPROC
+	
+	CFI_STARTPROC
+	SAVE_ARGS
+restore_norax:	
+	RESTORE_ARGS 1
+	ret
+	CFI_ENDPROC
+
+#ifdef CONFIG_SMP
+/* Support for read/write spinlocks. */
+	.text
+/* rax:	pointer to rwlock_t */	
+ENTRY(__write_lock_failed)
+	lock
+	addl $RW_LOCK_BIAS,(%rax)
+1:	rep
+	nop
+	cmpl $RW_LOCK_BIAS,(%rax)
+	jne 1b
+	lock 
+	subl $RW_LOCK_BIAS,(%rax)
+	jnz  __write_lock_failed
+	ret
+
+/* rax:	pointer to rwlock_t */	
+ENTRY(__read_lock_failed)
+	lock
+	incl (%rax)
+1:	rep
+	nop
+	cmpl $1,(%rax)
+	js 1b
+	lock
+	decl (%rax)
+	js __read_lock_failed
+	ret
+#endif
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c
new file mode 100644
index 000000000000..db8abba1ad81
--- /dev/null
+++ b/arch/x86_64/lib/usercopy.c
@@ -0,0 +1,153 @@
+/* 
+ * User address space access functions.
+ *
+ * Copyright 1997 Andi Kleen <ak@muc.de>
+ * Copyright 1997 Linus Torvalds
+ * Copyright 2002 Andi Kleen <ak@suse.de>
+ */
+#include <asm/uaccess.h>
+
+/*
+ * Copy a null terminated string from userspace.
+ */
+
+#define __do_strncpy_from_user(dst,src,count,res)			   \
+do {									   \
+	long __d0, __d1, __d2;						   \
+	might_sleep();							   \
+	__asm__ __volatile__(						   \
+		"	testq %1,%1\n"					   \
+		"	jz 2f\n"					   \
+		"0:	lodsb\n"					   \
+		"	stosb\n"					   \
+		"	testb %%al,%%al\n"				   \
+		"	jz 1f\n"					   \
+		"	decq %1\n"					   \
+		"	jnz 0b\n"					   \
+		"1:	subq %1,%0\n"					   \
+		"2:\n"							   \
+		".section .fixup,\"ax\"\n"				   \
+		"3:	movq %5,%0\n"					   \
+		"	jmp 2b\n"					   \
+		".previous\n"						   \
+		".section __ex_table,\"a\"\n"				   \
+		"	.align 8\n"					   \
+		"	.quad 0b,3b\n"					   \
+		".previous"						   \
+		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),	   \
+		  "=&D" (__d2)						   \
+		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
+		: "memory");						   \
+} while (0)
+
+long
+__strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	long res;
+	__do_strncpy_from_user(dst, src, count, res);
+	return res;
+}
+
+long
+strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	long res = -EFAULT;
+	if (access_ok(VERIFY_READ, src, 1))
+		__do_strncpy_from_user(dst, src, count, res);
+	return res;
+}
+
+/*
+ * Zero Userspace
+ */
+
+unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+	long __d0;
+	might_sleep();
+	/* no memory constraint because it doesn't change any memory gcc knows
+	   about */
+	asm volatile(
+		"	testq  %[size8],%[size8]\n"
+		"	jz     4f\n"
+		"0:	movq %[zero],(%[dst])\n"
+		"	addq   %[eight],%[dst]\n"
+		"	decl %%ecx ; jnz   0b\n"
+		"4:	movq  %[size1],%%rcx\n"
+		"	testl %%ecx,%%ecx\n"
+		"	jz     2f\n"
+		"1:	movb   %b[zero],(%[dst])\n"
+		"	incq   %[dst]\n"
+		"	decl %%ecx ; jnz  1b\n"
+		"2:\n"
+		".section .fixup,\"ax\"\n"
+		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
+		"	jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"       .align 8\n"
+		"	.quad 0b,3b\n"
+		"	.quad 1b,2b\n"
+		".previous"
+		: [size8] "=c"(size), [dst] "=&D" (__d0)
+		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
+		  [zero] "r" (0UL), [eight] "r" (8UL));
+	return size;
+}
+
+
+unsigned long clear_user(void __user *to, unsigned long n)
+{
+	if (access_ok(VERIFY_WRITE, to, n))
+		return __clear_user(to, n);
+	return n;
+}
+
+/*
+ * Return the size of a string (including the ending 0)
+ *
+ * Return 0 on exception, a value greater than N if too long
+ */
+
+long strnlen_user(const char __user *s, long n)
+{
+	long res = 0;
+	char c;
+
+	if (!access_ok(VERIFY_READ, s, n))
+		return 0;
+
+	while (1) {
+		if (res>n)
+			return n+1;
+		if (__get_user(c, s))
+			return 0;
+		if (!c)
+			return res+1;
+		res++;
+		s++;
+	}
+}
+
+long strlen_user(const char __user *s)
+{
+	long res = 0;
+	char c;
+
+	for (;;) {
+		if (get_user(c, s))
+			return 0;
+		if (!c)
+			return res+1;
+		res++;
+		s++;
+	}
+}
+
+unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
+{
+	if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { 
+		return copy_user_generic((__force void *)to, (__force void *)from, len);
+	} 
+	return len;		
+}
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile
new file mode 100644
index 000000000000..66c354ad80ca
--- /dev/null
+++ b/arch/x86_64/mm/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the linux x86_64-specific parts of the memory manager.
+#
+
+obj-y	 := init.o fault.o ioremap.o extable.o pageattr.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_DISCONTIGMEM) += numa.o
+obj-$(CONFIG_K8_NUMA) += k8topology.o
+obj-$(CONFIG_ACPI_NUMA) += srat.o
+
+hugetlbpage-y = ../../i386/mm/hugetlbpage.o
diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c
new file mode 100644
index 000000000000..2d78f9fb4035
--- /dev/null
+++ b/arch/x86_64/mm/extable.c
@@ -0,0 +1,35 @@
+/*
+ * linux/arch/x86_64/mm/extable.c
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+/* Simple binary search */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+	       const struct exception_table_entry *last,
+	       unsigned long value)
+{
+	/* Work around a B stepping K8 bug */
+	if ((value >> 32) == 0)
+		value |= 0xffffffffUL << 32; 
+
+        while (first <= last) {
+		const struct exception_table_entry *mid;
+		long diff;
+
+		mid = (last - first) / 2 + first;
+		diff = mid->insn - value;
+                if (diff == 0)
+                        return mid;
+                else if (diff < 0)
+                        first = mid+1;
+                else
+                        last = mid-1;
+        }
+        return NULL;
+}
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
new file mode 100644
index 000000000000..5724370475cc
--- /dev/null
+++ b/arch/x86_64/mm/fault.c
@@ -0,0 +1,579 @@
+/*
+ *  linux/arch/x86-64/mm/fault.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>		/* For unblank_screen() */
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm/kdebug.h>
+#include <asm-generic/sections.h>
+#include <asm/kdebug.h>
+
+void bust_spinlocks(int yes)
+{
+	int loglevel_save = console_loglevel;
+	if (yes) {
+		oops_in_progress = 1;
+	} else {
+#ifdef CONFIG_VT
+		unblank_screen();
+#endif
+		oops_in_progress = 0;
+		/*
+		 * OK, the message is on the console.  Now we call printk()
+		 * without oops_in_progress set so that printk will give klogd
+		 * a poke.  Hold onto your hats...
+		 */
+		console_loglevel = 15;		/* NMI oopser may have shut the console up */
+		printk(" ");
+		console_loglevel = loglevel_save;
+	}
+}
+
+/* Sometimes the CPU reports invalid exceptions on prefetch.
+   Check that here and ignore.
+   Opcode checker based on code by Richard Brunner */
+static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+				unsigned long error_code)
+{ 
+	unsigned char *instr = (unsigned char *)(regs->rip);
+	int scan_more = 1;
+	int prefetch = 0; 
+	unsigned char *max_instr = instr + 15;
+
+	/* If it was a exec fault ignore */
+	if (error_code & (1<<4))
+		return 0;
+	
+	/* Code segments in LDT could have a non zero base. Don't check
+	   when that's possible */
+	if (regs->cs & (1<<2))
+		return 0;
+
+	if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE)
+		return 0;
+
+	while (scan_more && instr < max_instr) { 
+		unsigned char opcode;
+		unsigned char instr_hi;
+		unsigned char instr_lo;
+
+		if (__get_user(opcode, instr))
+			break; 
+
+		instr_hi = opcode & 0xf0; 
+		instr_lo = opcode & 0x0f; 
+		instr++;
+
+		switch (instr_hi) { 
+		case 0x20:
+		case 0x30:
+			/* Values 0x26,0x2E,0x36,0x3E are valid x86
+			   prefixes.  In long mode, the CPU will signal
+			   invalid opcode if some of these prefixes are
+			   present so we will never get here anyway */
+			scan_more = ((instr_lo & 7) == 0x6);
+			break;
+			
+		case 0x40:
+			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
+			   Need to figure out under what instruction mode the
+			   instruction was issued ... */
+			/* Could check the LDT for lm, but for now it's good
+			   enough to assume that long mode only uses well known
+			   segments or kernel. */
+			scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS);
+			break;
+			
+		case 0x60:
+			/* 0x64 thru 0x67 are valid prefixes in all modes. */
+			scan_more = (instr_lo & 0xC) == 0x4;
+			break;		
+		case 0xF0:
+			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
+			scan_more = !instr_lo || (instr_lo>>1) == 1;
+			break;			
+		case 0x00:
+			/* Prefetch instruction is 0x0F0D or 0x0F18 */
+			scan_more = 0;
+			if (__get_user(opcode, instr)) 
+				break;
+			prefetch = (instr_lo == 0xF) &&
+				(opcode == 0x0D || opcode == 0x18);
+			break;			
+		default:
+			scan_more = 0;
+			break;
+		} 
+	}
+	return prefetch;
+}
+
+static int bad_address(void *p) 
+{ 
+	unsigned long dummy;
+	return __get_user(dummy, (unsigned long *)p);
+} 
+
+void dump_pagetable(unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	asm("movq %%cr3,%0" : "=r" (pgd));
+
+	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 
+	pgd += pgd_index(address);
+	printk("PGD %lx ", pgd_val(*pgd));
+	if (bad_address(pgd)) goto bad;
+	if (!pgd_present(*pgd)) goto ret; 
+
+	pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
+	if (bad_address(pud)) goto bad;
+	printk("PUD %lx ", pud_val(*pud));
+	if (!pud_present(*pud))	goto ret;
+
+	pmd = pmd_offset(pud, address);
+	if (bad_address(pmd)) goto bad;
+	printk("PMD %lx ", pmd_val(*pmd));
+	if (!pmd_present(*pmd))	goto ret;	 
+
+	pte = pte_offset_kernel(pmd, address);
+	if (bad_address(pte)) goto bad;
+	printk("PTE %lx", pte_val(*pte)); 
+ret:
+	printk("\n");
+	return;
+bad:
+	printk("BAD\n");
+}
+
+static const char errata93_warning[] = 
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+
+/* Workaround for K8 erratum #93 & buggy BIOS.
+   BIOS SMM functions are required to use a specific workaround
+   to avoid corruption of the 64bit RIP register on C stepping K8. 
+   A lot of BIOS that didn't get tested properly miss this. 
+   The OS sees this as a page fault with the upper 32bits of RIP cleared.
+   Try to work around it here.
+   Note we only handle faults in kernel here. */
+
+static int is_errata93(struct pt_regs *regs, unsigned long address) 
+{
+	static int warned;
+	if (address != regs->rip)
+		return 0;
+	if ((address >> 32) != 0) 
+		return 0;
+	address |= 0xffffffffUL << 32;
+	if ((address >= (u64)_stext && address <= (u64)_etext) || 
+	    (address >= MODULES_VADDR && address <= MODULES_END)) { 
+		if (!warned) {
+			printk(errata93_warning); 		
+			warned = 1;
+		}
+		regs->rip = address;
+		return 1;
+	}
+	return 0;
+} 
+
+int unhandled_signal(struct task_struct *tsk, int sig)
+{
+	if (tsk->pid == 1)
+		return 1;
+	/* Warn for strace, but not for gdb */
+	if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) &&
+	    (tsk->ptrace & PT_PTRACED))
+		return 0;
+	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
+		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
+}
+
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+				 unsigned long error_code)
+{
+	oops_begin();
+	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+	       current->comm, address);
+	dump_pagetable(address);
+	__die("Bad pagetable", regs, error_code);
+	oops_end();
+	do_exit(SIGKILL);
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ */
+static int vmalloc_fault(unsigned long address)
+{
+	pgd_t *pgd, *pgd_ref;
+	pud_t *pud, *pud_ref;
+	pmd_t *pmd, *pmd_ref;
+	pte_t *pte, *pte_ref;
+
+	/* Copy kernel mappings over when needed. This can also
+	   happen within a race in page table update. In the later
+	   case just flush. */
+
+	pgd = pgd_offset(current->mm ?: &init_mm, address);
+	pgd_ref = pgd_offset_k(address);
+	if (pgd_none(*pgd_ref))
+		return -1;
+	if (pgd_none(*pgd))
+		set_pgd(pgd, *pgd_ref);
+
+	/* Below here mismatches are bugs because these lower tables
+	   are shared */
+
+	pud = pud_offset(pgd, address);
+	pud_ref = pud_offset(pgd_ref, address);
+	if (pud_none(*pud_ref))
+		return -1;
+	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
+		BUG();
+	pmd = pmd_offset(pud, address);
+	pmd_ref = pmd_offset(pud_ref, address);
+	if (pmd_none(*pmd_ref))
+		return -1;
+	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+		BUG();
+	pte_ref = pte_offset_kernel(pmd_ref, address);
+	if (!pte_present(*pte_ref))
+		return -1;
+	pte = pte_offset_kernel(pmd, address);
+	if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref))
+		BUG();
+	__flush_tlb_all();
+	return 0;
+}
+
+int page_fault_trace = 0;
+int exception_trace = 1;
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ *
+ * error_code:
+ *	bit 0 == 0 means no page found, 1 means protection fault
+ *	bit 1 == 0 means read, 1 means write
+ *	bit 2 == 0 means kernel, 1 means user-mode
+ *      bit 3 == 1 means fault was an instruction fetch
+ */
+asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct vm_area_struct * vma;
+	unsigned long address;
+	const struct exception_table_entry *fixup;
+	int write;
+	siginfo_t info;
+
+#ifdef CONFIG_CHECKING
+	{ 
+		unsigned long gs; 
+		struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); 
+		rdmsrl(MSR_GS_BASE, gs); 
+		if (gs != (unsigned long)pda) { 
+			wrmsrl(MSR_GS_BASE, pda); 
+			printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
+		}
+	}
+#endif
+
+	/* get the address */
+	__asm__("movq %%cr2,%0":"=r" (address));
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+					SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (likely(regs->eflags & X86_EFLAGS_IF))
+		local_irq_enable();
+
+	if (unlikely(page_fault_trace))
+		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
+		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
+
+	tsk = current;
+	mm = tsk->mm;
+	info.si_code = SEGV_MAPERR;
+
+
+	/*
+	 * We fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * This verifies that the fault happens in kernel space
+	 * (error_code & 4) == 0, and that the fault was not a
+	 * protection error (error_code & 1) == 0.
+	 */
+	if (unlikely(address >= TASK_SIZE)) {
+		if (!(error_code & 5)) {
+			if (vmalloc_fault(address) < 0)
+				goto bad_area_nosemaphore;
+			return;
+		}
+		/*
+		 * Don't take the mm semaphore here. If we fixup a prefetch
+		 * fault we could otherwise deadlock.
+		 */
+		goto bad_area_nosemaphore;
+	}
+
+	if (unlikely(error_code & (1 << 3)))
+		pgtable_bad(address, regs, error_code);
+
+	/*
+	 * If we're in an interrupt or have no user
+	 * context, we must not take the fault..
+	 */
+	if (unlikely(in_atomic() || !mm))
+		goto bad_area_nosemaphore;
+
+ again:
+	/* When running in the kernel we expect faults to occur only to
+	 * addresses in user space.  All other faults represent errors in the
+	 * kernel and should generate an OOPS.  Unfortunatly, in the case of an
+	 * erroneous fault occuring in a code path which already holds mmap_sem
+	 * we will deadlock attempting to validate the fault against the
+	 * address space.  Luckily the kernel only validly references user
+	 * space from well defined areas of code, which are listed in the
+	 * exceptions table.
+	 *
+	 * As the vast majority of faults will be valid we will only perform
+	 * the source reference check when there is a possibilty of a deadlock.
+	 * Attempt to lock the address space, if we cannot we then validate the
+	 * source.  If this is invalid we can skip the address space check,
+	 * thus avoiding the deadlock.
+	 */
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		if ((error_code & 4) == 0 &&
+		    !search_exception_tables(regs->rip))
+			goto bad_area_nosemaphore;
+		down_read(&mm->mmap_sem);
+	}
+
+	vma = find_vma(mm, address);
+	if (!vma)
+		goto bad_area;
+	if (likely(vma->vm_start <= address))
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (error_code & 4) {
+		// XXX: align red zone size with ABI 
+		if (address + 128 < regs->rsp)
+			goto bad_area;
+	}
+	if (expand_stack(vma, address))
+		goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+	info.si_code = SEGV_ACCERR;
+	write = 0;
+	switch (error_code & 3) {
+		default:	/* 3: write, present */
+			/* fall through */
+		case 2:		/* write, not present */
+			if (!(vma->vm_flags & VM_WRITE))
+				goto bad_area;
+			write++;
+			break;
+		case 1:		/* read, present */
+			goto bad_area;
+		case 0:		/* read, not present */
+			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+				goto bad_area;
+	}
+
+	/*
+	 * If for any reason at all we couldn't handle the fault,
+	 * make sure we exit gracefully rather than endlessly redo
+	 * the fault.
+	 */
+	switch (handle_mm_fault(mm, vma, address, write)) {
+	case 1:
+		tsk->min_flt++;
+		break;
+	case 2:
+		tsk->maj_flt++;
+		break;
+	case 0:
+		goto do_sigbus;
+	default:
+		goto out_of_memory;
+	}
+
+	up_read(&mm->mmap_sem);
+	return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+	up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+
+#ifdef CONFIG_IA32_EMULATION
+	/* 32bit vsyscall. map on demand. */
+	if (test_thread_flag(TIF_IA32) &&
+	    address >= VSYSCALL32_BASE && address < VSYSCALL32_END) {
+		if (map_syscall32(mm, address) < 0)
+			goto out_of_memory2;
+		return;
+	}
+#endif
+
+	/* User mode accesses just cause a SIGSEGV */
+	if (error_code & 4) {
+		if (is_prefetch(regs, address, error_code))
+			return;
+
+		/* Work around K8 erratum #100 K8 in compat mode
+		   occasionally jumps to illegal addresses >4GB.  We
+		   catch this here in the page fault handler because
+		   these addresses are not reachable. Just detect this
+		   case and return.  Any code segment in LDT is
+		   compatibility mode. */
+		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+		    (address >> 32))
+			return;
+
+		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
+			printk(
+		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
+					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
+					tsk->comm, tsk->pid, address, regs->rip,
+					regs->rsp, error_code);
+		}
+       
+		tsk->thread.cr2 = address;
+		/* Kernel addresses are always protection faults */
+		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+		tsk->thread.trap_no = 14;
+		info.si_signo = SIGSEGV;
+		info.si_errno = 0;
+		/* info.si_code has been set above */
+		info.si_addr = (void __user *)address;
+		force_sig_info(SIGSEGV, &info, tsk);
+		return;
+	}
+
+no_context:
+	
+	/* Are we prepared to handle this kernel fault?  */
+	fixup = search_exception_tables(regs->rip);
+	if (fixup) {
+		regs->rip = fixup->fixup;
+		return;
+	}
+
+	/* 
+	 * Hall of shame of CPU/BIOS bugs.
+	 */
+
+ 	if (is_prefetch(regs, address, error_code))
+ 		return;
+
+	if (is_errata93(regs, address))
+		return; 
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+
+	oops_begin(); 
+
+	if (address < PAGE_SIZE)
+		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+	else
+		printk(KERN_ALERT "Unable to handle kernel paging request");
+	printk(" at %016lx RIP: \n" KERN_ALERT,address);
+	printk_address(regs->rip);
+	printk("\n");
+	dump_pagetable(address);
+	__die("Oops", regs, error_code);
+	/* Executive summary in case the body of the oops scrolled away */
+	printk(KERN_EMERG "CR2: %016lx\n", address);
+	oops_end(); 
+	do_exit(SIGKILL);
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+	up_read(&mm->mmap_sem);
+out_of_memory2:
+	if (current->pid == 1) { 
+		yield();
+		goto again;
+	}
+	printk("VM: killing process %s\n", tsk->comm);
+	if (error_code & 4)
+		do_exit(SIGKILL);
+	goto no_context;
+
+do_sigbus:
+	up_read(&mm->mmap_sem);
+
+	/* Kernel mode? Handle exceptions or die */
+	if (!(error_code & 4))
+		goto no_context;
+
+	tsk->thread.cr2 = address;
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 14;
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = BUS_ADRERR;
+	info.si_addr = (void __user *)address;
+	force_sig_info(SIGBUS, &info, tsk);
+	return;
+}
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
new file mode 100644
index 000000000000..b0d604551d86
--- /dev/null
+++ b/arch/x86_64/mm/init.c
@@ -0,0 +1,630 @@
+/*
+ *  linux/arch/x86_64/mm/init.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/proc_fs.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+#ifdef CONFIG_GART_IOMMU
+extern int swiotlb;
+#endif
+
+extern char _stext[];
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+/*
+ * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
+ * physical space so we can cache the place of the first one and move
+ * around without checking the pgd every time.
+ */
+
+void show_mem(void)
+{
+	int i, total = 0, reserved = 0;
+	int shared = 0, cached = 0;
+	pg_data_t *pgdat;
+	struct page *page;
+
+	printk("Mem-info:\n");
+	show_free_areas();
+	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+
+	for_each_pgdat(pgdat) {
+               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+			page = pfn_to_page(pgdat->node_start_pfn + i);
+			total++;
+                       if (PageReserved(page))
+			reserved++;
+                       else if (PageSwapCache(page))
+			cached++;
+                       else if (page_count(page))
+                               shared += page_count(page) - 1;
+               }
+	}
+	printk("%d pages of RAM\n", total);
+	printk("%d reserved pages\n",reserved);
+	printk("%d pages shared\n",shared);
+	printk("%d pages swap cached\n",cached);
+}
+
+/* References to section boundaries */
+
+extern char _text, _etext, _edata, __bss_start, _end[];
+extern char __init_begin, __init_end;
+
+int after_bootmem;
+
+static void *spp_getpage(void)
+{ 
+	void *ptr;
+	if (after_bootmem)
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
+	else
+		ptr = alloc_bootmem_pages(PAGE_SIZE);
+	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
+		panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
+
+	Dprintk("spp_getpage %p\n", ptr);
+	return ptr;
+} 
+
+static void set_pte_phys(unsigned long vaddr,
+			 unsigned long phys, pgprot_t prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, new_pte;
+
+	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
+
+	pgd = pgd_offset_k(vaddr);
+	if (pgd_none(*pgd)) {
+		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
+		return;
+	}
+	pud = pud_offset(pgd, vaddr);
+	if (pud_none(*pud)) {
+		pmd = (pmd_t *) spp_getpage(); 
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+		if (pmd != pmd_offset(pud, 0)) {
+			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
+			return;
+		}
+	}
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd)) {
+		pte = (pte_t *) spp_getpage();
+		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+		if (pte != pte_offset_kernel(pmd, 0)) {
+			printk("PAGETABLE BUG #02!\n");
+			return;
+		}
+	}
+	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
+
+	pte = pte_offset_kernel(pmd, vaddr);
+	if (!pte_none(*pte) &&
+	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
+		pte_ERROR(*pte);
+	set_pte(pte, new_pte);
+
+	/*
+	 * It's enough to flush this one mapping.
+	 * (PGE mappings get flushed as well)
+	 */
+	__flush_tlb_one(vaddr);
+}
+
+/* NOTE: this is meant to be run only at boot */
+void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+{
+	unsigned long address = __fix_to_virt(idx);
+
+	if (idx >= __end_of_fixed_addresses) {
+		printk("Invalid __set_fixmap\n");
+		return;
+	}
+	set_pte_phys(address, phys, prot);
+}
+
+unsigned long __initdata table_start, table_end; 
+
+extern pmd_t temp_boot_pmds[]; 
+
+static  struct temp_map { 
+	pmd_t *pmd;
+	void  *address; 
+	int    allocated; 
+} temp_mappings[] __initdata = { 
+	{ &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
+	{ &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, 
+	{}
+}; 
+
+static __init void *alloc_low_page(int *index, unsigned long *phys) 
+{ 
+	struct temp_map *ti;
+	int i; 
+	unsigned long pfn = table_end++, paddr; 
+	void *adr;
+
+	if (pfn >= end_pfn) 
+		panic("alloc_low_page: ran out of memory"); 
+	for (i = 0; temp_mappings[i].allocated; i++) {
+		if (!temp_mappings[i].pmd) 
+			panic("alloc_low_page: ran out of temp mappings"); 
+	} 
+	ti = &temp_mappings[i];
+	paddr = (pfn << PAGE_SHIFT) & PMD_MASK; 
+	set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); 
+	ti->allocated = 1; 
+	__flush_tlb(); 	       
+	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); 
+	*index = i; 
+	*phys  = pfn * PAGE_SIZE;  
+	return adr; 
+} 
+
+static __init void unmap_low_page(int i)
+{ 
+	struct temp_map *ti = &temp_mappings[i];
+	set_pmd(ti->pmd, __pmd(0));
+	ti->allocated = 0; 
+} 
+
+static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{ 
+	long i, j; 
+
+	i = pud_index(address);
+	pud = pud + i;
+	for (; i < PTRS_PER_PUD; pud++, i++) {
+		int map; 
+		unsigned long paddr, pmd_phys;
+		pmd_t *pmd;
+
+		paddr = address + i*PUD_SIZE;
+		if (paddr >= end) { 
+			for (; i < PTRS_PER_PUD; i++, pud++) 
+				set_pud(pud, __pud(0)); 
+			break;
+		} 
+
+		if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { 
+			set_pud(pud, __pud(0)); 
+			continue;
+		} 
+
+		pmd = alloc_low_page(&map, &pmd_phys);
+		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
+		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+			unsigned long pe;
+
+			if (paddr >= end) { 
+				for (; j < PTRS_PER_PMD; j++, pmd++)
+					set_pmd(pmd,  __pmd(0)); 
+				break;
+		}
+			pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
+			pe &= __supported_pte_mask;
+			set_pmd(pmd, __pmd(pe));
+		}
+		unmap_low_page(map);
+	}
+	__flush_tlb();
+} 
+
+static void __init find_early_table_space(unsigned long end)
+{
+	unsigned long puds, pmds, tables;
+
+	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
+		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+	table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
+	if (table_start == -1UL)
+		panic("Cannot find space for the kernel page tables");
+
+	table_start >>= PAGE_SHIFT;
+	table_end = table_start;
+}
+
+/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
+   This runs before bootmem is initialized and gets pages directly from the 
+   physical memory. To access them they are temporarily mapped. */
+void __init init_memory_mapping(unsigned long start, unsigned long end)
+{ 
+	unsigned long next; 
+
+	Dprintk("init_memory_mapping\n");
+
+	/* 
+	 * Find space for the kernel direct mapping tables.
+	 * Later we should allocate these tables in the local node of the memory
+	 * mapped.  Unfortunately this is done currently before the nodes are 
+	 * discovered.
+	 */
+	find_early_table_space(end);
+
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
+
+	for (; start < end; start = next) {
+		int map;
+		unsigned long pud_phys; 
+		pud_t *pud = alloc_low_page(&map, &pud_phys);
+		next = start + PGDIR_SIZE;
+		if (next > end) 
+			next = end; 
+		phys_pud_init(pud, __pa(start), __pa(next));
+		set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+		unmap_low_page(map);   
+	} 
+
+	asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+	__flush_tlb_all();
+	early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, 
+	       table_start<<PAGE_SHIFT, 
+	       table_end<<PAGE_SHIFT);
+}
+
+extern struct x8664_pda cpu_pda[NR_CPUS];
+
+/* Assumes all CPUs still execute in init_mm */
+void zap_low_mappings(void)
+{
+	pgd_t *pgd = pgd_offset_k(0UL);
+	pgd_clear(pgd);
+	flush_tlb_all();
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+void __init paging_init(void)
+{
+	{
+		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+		unsigned int max_dma;
+
+		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+		if (end_pfn < max_dma)
+			zones_size[ZONE_DMA] = end_pfn;
+		else {
+			zones_size[ZONE_DMA] = max_dma;
+			zones_size[ZONE_NORMAL] = end_pfn - max_dma;
+		}
+		free_area_init(zones_size);
+	}
+	return;
+}
+#endif
+
+/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
+   from the CPU leading to inconsistent cache lines. address and size
+   must be aligned to 2MB boundaries. 
+   Does nothing when the mapping doesn't exist. */
+void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
+{
+	unsigned long end = address + size;
+
+	BUG_ON(address & ~LARGE_PAGE_MASK);
+	BUG_ON(size & ~LARGE_PAGE_MASK); 
+	
+	for (; address < end; address += LARGE_PAGE_SIZE) { 
+		pgd_t *pgd = pgd_offset_k(address);
+		pud_t *pud;
+		pmd_t *pmd;
+		if (pgd_none(*pgd))
+			continue;
+		pud = pud_offset(pgd, address);
+		if (pud_none(*pud))
+			continue; 
+		pmd = pmd_offset(pud, address);
+		if (!pmd || pmd_none(*pmd))
+			continue; 
+		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
+			/* Could handle this, but it should not happen currently. */
+			printk(KERN_ERR 
+	       "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
+			pmd_ERROR(*pmd); 
+		}
+		set_pmd(pmd, __pmd(0)); 		
+	}
+	__flush_tlb_all();
+} 
+
+static inline int page_is_ram (unsigned long pagenr)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		unsigned long addr, end;
+
+		if (e820.map[i].type != E820_RAM)	/* not usable memory */
+			continue;
+		/*
+		 *	!!!FIXME!!! Some BIOSen report areas as RAM that
+		 *	are not. Notably the 640->1Mb area. We need a sanity
+		 *	check here.
+		 */
+		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
+		if  ((pagenr >= addr) && (pagenr < end))
+			return 1;
+	}
+	return 0;
+}
+
+extern int swiotlb_force;
+
+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
+			 kcore_vsyscall;
+
+void __init mem_init(void)
+{
+	int codesize, reservedpages, datasize, initsize;
+	int tmp;
+
+#ifdef CONFIG_SWIOTLB
+	if (swiotlb_force)
+		swiotlb = 1;
+	if (!iommu_aperture &&
+	    (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
+	       swiotlb = 1;
+	if (swiotlb)
+		swiotlb_init();	
+#endif
+
+	/* How many end-of-memory variables you have, grandma! */
+	max_low_pfn = end_pfn;
+	max_pfn = end_pfn;
+	num_physpages = end_pfn;
+	high_memory = (void *) __va(end_pfn * PAGE_SIZE);
+
+	/* clear the zero-page */
+	memset(empty_zero_page, 0, PAGE_SIZE);
+
+	reservedpages = 0;
+
+	/* this will put all low memory onto the freelists */
+#ifdef CONFIG_DISCONTIGMEM
+	totalram_pages += numa_free_all_bootmem();
+	tmp = 0;
+	/* should count reserved pages here for all nodes */ 
+#else
+	max_mapnr = end_pfn;
+	if (!mem_map) BUG();
+
+	totalram_pages += free_all_bootmem();
+
+	for (tmp = 0; tmp < end_pfn; tmp++)
+		/*
+		 * Only count reserved RAM pages
+		 */
+		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
+			reservedpages++;
+#endif
+
+	after_bootmem = 1;
+
+	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
+	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
+	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+	/* Register memory areas for /proc/kcore */
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
+	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
+		   VMALLOC_END-VMALLOC_START);
+	kclist_add(&kcore_kernel, &_stext, _end - _stext);
+	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
+	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
+				 VSYSCALL_END - VSYSCALL_START);
+
+	printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
+		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+		end_pfn << (PAGE_SHIFT-10),
+		codesize >> 10,
+		reservedpages << (PAGE_SHIFT-10),
+		datasize >> 10,
+		initsize >> 10);
+
+	/*
+	 * Subtle. SMP is doing its boot stuff late (because it has to
+	 * fork idle threads) - but it also needs low mappings for the
+	 * protected-mode entry to work. We zap these entries only after
+	 * the WP-bit has been tested.
+	 */
+#ifndef CONFIG_SMP
+	zap_low_mappings();
+#endif
+}
+
+extern char __initdata_begin[], __initdata_end[];
+
+void free_initmem(void)
+{
+	unsigned long addr;
+
+	addr = (unsigned long)(&__init_begin);
+	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(addr));
+		set_page_count(virt_to_page(addr), 1);
+		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
+		free_page(addr);
+		totalram_pages++;
+	}
+	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
+	printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+	if (start < (unsigned long)&_end)
+		return;
+	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+	for (; start < end; start += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(start));
+		set_page_count(virt_to_page(start), 1);
+		free_page(start);
+		totalram_pages++;
+	}
+}
+#endif
+
+void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
+{ 
+	/* Should check here against the e820 map to avoid double free */ 
+#ifdef CONFIG_DISCONTIGMEM
+	int nid = phys_to_nid(phys);
+  	reserve_bootmem_node(NODE_DATA(nid), phys, len);
+#else       		
+	reserve_bootmem(phys, len);    
+#endif
+}
+
+int kern_addr_valid(unsigned long addr) 
+{ 
+	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+	if (above != 0 && above != -1UL)
+		return 0; 
+	
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		return 0;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud))
+		return 0; 
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return 0;
+	if (pmd_large(*pmd))
+		return pfn_valid(pmd_pfn(*pmd));
+
+	pte = pte_offset_kernel(pmd, addr);
+	if (pte_none(*pte))
+		return 0;
+	return pfn_valid(pte_pfn(*pte));
+}
+
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+
+extern int exception_trace, page_fault_trace;
+
+static ctl_table debug_table2[] = {
+	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
+	  proc_dointvec },
+#ifdef CONFIG_CHECKING
+	{ 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
+	  proc_dointvec },
+#endif
+	{ 0, }
+}; 
+
+static ctl_table debug_root_table2[] = { 
+	{ .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
+	   .child = debug_table2 }, 
+	{ 0 }, 
+}; 
+
+static __init int x8664_sysctl_init(void)
+{ 
+	register_sysctl_table(debug_root_table2, 1);
+	return 0;
+}
+__initcall(x8664_sysctl_init);
+#endif
+
+/* Pseudo VMAs to allow ptrace access for the vsyscall pages.  x86-64 has two
+   different ones: one for 32bit and one for 64bit. Use the appropiate
+   for the target task. */
+
+static struct vm_area_struct gate_vma = {
+	.vm_start = VSYSCALL_START,
+	.vm_end = VSYSCALL_END,
+	.vm_page_prot = PAGE_READONLY
+};
+
+static struct vm_area_struct gate32_vma = {
+	.vm_start = VSYSCALL32_BASE,
+	.vm_end = VSYSCALL32_END,
+	.vm_page_prot = PAGE_READONLY
+};
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (test_tsk_thread_flag(tsk, TIF_IA32)) {
+		/* lookup code assumes the pages are present. set them up
+		   now */
+		if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0)
+			return NULL;
+		return &gate32_vma;
+	}
+#endif
+	return &gate_vma;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+	struct vm_area_struct *vma = get_gate_vma(task);
+	return (addr >= vma->vm_start) && (addr < vma->vm_end);
+}
+
+/* Use this when you have no reliable task/vma, typically from interrupt
+ * context.  It is less reliable than using the task's vma and may give
+ * false positives.
+ */
+int in_gate_area_no_task(unsigned long addr)
+{
+	return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) ||
+		((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END)));
+}
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
new file mode 100644
index 000000000000..74ec8554b195
--- /dev/null
+++ b/arch/x86_64/mm/ioremap.c
@@ -0,0 +1,283 @@
+/*
+ * arch/x86_64/mm/ioremap.c
+ *
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/io.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+
+#define ISA_START_ADDRESS      0xa0000
+#define ISA_END_ADDRESS                0x100000
+
+static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
+	unsigned long phys_addr, unsigned long flags)
+{
+	unsigned long end;
+	unsigned long pfn;
+
+	address &= ~PMD_MASK;
+	end = address + size;
+	if (end > PMD_SIZE)
+		end = PMD_SIZE;
+	if (address >= end)
+		BUG();
+	pfn = phys_addr >> PAGE_SHIFT;
+	do {
+		if (!pte_none(*pte)) {
+			printk("remap_area_pte: page already exists\n");
+			BUG();
+		}
+		set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | 
+					_PAGE_GLOBAL | _PAGE_DIRTY | _PAGE_ACCESSED | flags)));
+		address += PAGE_SIZE;
+		pfn++;
+		pte++;
+	} while (address && (address < end));
+}
+
+static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
+	unsigned long phys_addr, unsigned long flags)
+{
+	unsigned long end;
+
+	address &= ~PUD_MASK;
+	end = address + size;
+	if (end > PUD_SIZE)
+		end = PUD_SIZE;
+	phys_addr -= address;
+	if (address >= end)
+		BUG();
+	do {
+		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		if (!pte)
+			return -ENOMEM;
+		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
+		address = (address + PMD_SIZE) & PMD_MASK;
+		pmd++;
+	} while (address && (address < end));
+	return 0;
+}
+
+static inline int remap_area_pud(pud_t * pud, unsigned long address, unsigned long size,
+	unsigned long phys_addr, unsigned long flags)
+{
+	unsigned long end;
+
+	address &= ~PGDIR_MASK;
+	end = address + size;
+	if (end > PGDIR_SIZE)
+		end = PGDIR_SIZE;
+	phys_addr -= address;
+	if (address >= end)
+		BUG();
+	do {
+		pmd_t * pmd = pmd_alloc(&init_mm, pud, address);
+		if (!pmd)
+			return -ENOMEM;
+		remap_area_pmd(pmd, address, end - address, address + phys_addr, flags);
+		address = (address + PUD_SIZE) & PUD_MASK;
+		pud++;
+	} while (address && (address < end));
+	return 0;
+}
+
+static int remap_area_pages(unsigned long address, unsigned long phys_addr,
+				 unsigned long size, unsigned long flags)
+{
+	int error;
+	pgd_t *pgd;
+	unsigned long end = address + size;
+
+	phys_addr -= address;
+	pgd = pgd_offset_k(address);
+	flush_cache_all();
+	if (address >= end)
+		BUG();
+	spin_lock(&init_mm.page_table_lock);
+	do {
+		pud_t *pud;
+		pud = pud_alloc(&init_mm, pgd, address);
+		error = -ENOMEM;
+		if (!pud)
+			break;
+		if (remap_area_pud(pud, address, end - address,
+					 phys_addr + address, flags))
+			break;
+		error = 0;
+		address = (address + PGDIR_SIZE) & PGDIR_MASK;
+		pgd++;
+	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
+	flush_tlb_all();
+	return error;
+}
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+static int
+ioremap_change_attr(unsigned long phys_addr, unsigned long size,
+					unsigned long flags)
+{
+	int err = 0;
+	if (flags && phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
+		unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned long vaddr = (unsigned long) __va(phys_addr);
+
+		/*
+ 		 * Must use a address here and not struct page because the phys addr
+		 * can be a in hole between nodes and not have an memmap entry.
+		 */
+		err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
+		if (!err)
+			global_flush_tlb();
+	}
+	return err;
+}
+
+/*
+ * Generic mapping function
+ */
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
+{
+	void * addr;
+	struct vm_struct * area;
+	unsigned long offset, last_addr;
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (!size || last_addr < phys_addr)
+		return NULL;
+
+	/*
+	 * Don't remap the low PCI/ISA area, it's always mapped..
+	 */
+	if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+		return (__force void __iomem *)phys_to_virt(phys_addr);
+
+#ifndef CONFIG_DISCONTIGMEM
+	/*
+	 * Don't allow anybody to remap normal RAM that we're using..
+	 */
+	if (last_addr < virt_to_phys(high_memory)) {
+		char *t_addr, *t_end;
+ 		struct page *page;
+
+		t_addr = __va(phys_addr);
+		t_end = t_addr + (size - 1);
+	   
+		for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
+			if(!PageReserved(page))
+				return NULL;
+	}
+#endif
+
+	/*
+	 * Mappings have to be page-aligned
+	 */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+	/*
+	 * Ok, go for it..
+	 */
+	area = get_vm_area(size, VM_IOREMAP | (flags << 20));
+	if (!area)
+		return NULL;
+	area->phys_addr = phys_addr;
+	addr = area->addr;
+	if (remap_area_pages((unsigned long) addr, phys_addr, size, flags)) {
+		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+		return NULL;
+	}
+	if (ioremap_change_attr(phys_addr, size, flags) < 0) {
+		area->flags &= 0xffffff;
+		vunmap(addr);
+		return NULL;
+	}
+	return (__force void __iomem *) (offset + (char *)addr);
+}
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @offset:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address. 
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many 
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ * 
+ * Must be freed with iounmap.
+ */
+
+void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
+{
+	return __ioremap(phys_addr, size, _PAGE_PCD);
+}
+
+void iounmap(volatile void __iomem *addr)
+{
+	struct vm_struct *p, **pprev;
+
+	if (addr <= high_memory) 
+		return; 
+	if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
+		addr < phys_to_virt(ISA_END_ADDRESS))
+		return;
+
+	write_lock(&vmlist_lock);
+	for (p = vmlist, pprev = &vmlist; p != NULL; pprev = &p->next, p = *pprev)
+		if (p->addr == (void *)(PAGE_MASK & (unsigned long)addr))
+			break;
+	if (!p) { 
+		printk("__iounmap: bad address %p\n", addr);
+		goto out_unlock;
+	}
+	*pprev = p->next;
+	unmap_vm_area(p);
+	if ((p->flags >> 20) &&
+		p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) {
+		/* p->size includes the guard page, but cpa doesn't like that */
+		change_page_attr(virt_to_page(__va(p->phys_addr)),
+				 p->size >> PAGE_SHIFT,
+				 PAGE_KERNEL);
+		global_flush_tlb();
+	} 
+out_unlock:
+	write_unlock(&vmlist_lock);
+	kfree(p); 
+}
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
new file mode 100644
index 000000000000..ec35747aacd7
--- /dev/null
+++ b/arch/x86_64/mm/k8topology.c
@@ -0,0 +1,168 @@
+/* 
+ * AMD K8 NUMA support.
+ * Discover the memory map and associated nodes.
+ * 
+ * This version reads it directly from the K8 northbridge.
+ * 
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <asm/io.h>
+#include <linux/pci_ids.h>
+#include <asm/types.h>
+#include <asm/mmzone.h>
+#include <asm/proto.h>
+#include <asm/e820.h>
+#include <asm/pci-direct.h>
+#include <asm/numa.h>
+
+static __init int find_northbridge(void)
+{
+	int num; 
+
+	for (num = 0; num < 32; num++) { 
+		u32 header;
+		
+		header = read_pci_config(0, num, 0, 0x00);  
+		if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
+			continue; 	
+
+		header = read_pci_config(0, num, 1, 0x00); 
+		if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
+			continue;	
+		return num; 
+	} 
+
+	return -1; 	
+}
+
+int __init k8_scan_nodes(unsigned long start, unsigned long end)
+{ 
+	unsigned long prevbase;
+	struct node nodes[8];
+	int nodeid, i, nb; 
+	int found = 0;
+	u32 reg;
+	unsigned numnodes;
+	nodemask_t nodes_parsed;
+
+	nodes_clear(nodes_parsed);
+
+	nb = find_northbridge(); 
+	if (nb < 0) 
+		return nb;
+
+	printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 
+
+	reg = read_pci_config(0, nb, 0, 0x60); 
+	numnodes = ((reg >> 4) & 0xF) + 1;
+
+	printk(KERN_INFO "Number of nodes %d\n", numnodes);
+
+	memset(&nodes,0,sizeof(nodes)); 
+	prevbase = 0;
+	for (i = 0; i < 8; i++) { 
+		unsigned long base,limit; 
+
+		base = read_pci_config(0, nb, 1, 0x40 + i*8);
+		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
+
+		nodeid = limit & 7; 
+		if ((base & 3) == 0) { 
+			if (i < numnodes)
+				printk("Skipping disabled node %d\n", i); 
+			continue;
+		} 
+		if (nodeid >= numnodes) {
+			printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+			       base, limit); 
+			continue;
+		} 
+
+		if (!limit) { 
+			printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
+			       base);
+			continue;
+		}
+		if ((base >> 8) & 3 || (limit >> 8) & 3) {
+			printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 
+			       nodeid, (base>>8)&3, (limit>>8) & 3); 
+			return -1; 
+		}	
+		if (node_isset(nodeid, nodes_parsed)) { 
+			printk(KERN_INFO "Node %d already present. Skipping\n", 
+			       nodeid);
+			continue;
+		}
+
+		limit >>= 16; 
+		limit <<= 24; 
+		limit |= (1<<24)-1;
+
+		if (limit > end_pfn << PAGE_SHIFT)
+			limit = end_pfn << PAGE_SHIFT;
+		if (limit <= base)
+			continue; 
+			
+		base >>= 16;
+		base <<= 24; 
+
+		if (base < start) 
+			base = start; 
+		if (limit > end) 
+			limit = end; 
+		if (limit == base) { 
+			printk(KERN_ERR "Empty node %d\n", nodeid); 
+			continue; 
+		}
+		if (limit < base) { 
+			printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
+			       nodeid, base, limit); 			       
+			continue;
+		} 
+		
+		/* Could sort here, but pun for now. Should not happen anyroads. */
+		if (prevbase > base) { 
+			printk(KERN_ERR "Node map not sorted %lx,%lx\n",
+			       prevbase,base);
+			return -1;
+		}
+			
+		printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 
+		       nodeid, base, limit); 
+		
+		found++;
+		
+		nodes[nodeid].start = base; 
+		nodes[nodeid].end = limit;
+
+		prevbase = base;
+
+		node_set(nodeid, nodes_parsed);
+	} 
+
+	if (!found)
+		return -1; 
+
+	memnode_shift = compute_hash_shift(nodes, numnodes);
+	if (memnode_shift < 0) { 
+		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 
+		return -1; 
+	} 
+	printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 
+
+	for (i = 0; i < 8; i++) {
+		if (nodes[i].start != nodes[i].end) { 
+			/* assume 1:1 NODE:CPU */
+			cpu_to_node[i] = i; 
+			setup_node_bootmem(i, nodes[i].start, nodes[i].end); 
+		} 
+	}
+
+	numa_init_array();
+	return 0;
+} 
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
new file mode 100644
index 000000000000..fd9f25d7a6c4
--- /dev/null
+++ b/arch/x86_64/mm/numa.c
@@ -0,0 +1,294 @@
+/* 
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */ 
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+struct pglist_data *node_data[MAX_NUMNODES];
+bootmem_data_t plat_node_bdata[MAX_NUMNODES];
+
+int memnode_shift;
+u8  memnodemap[NODEMAPSIZE];
+
+unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
+cpumask_t     node_to_cpumask[MAX_NUMNODES];
+
+int numa_off __initdata;
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+	int i; 
+	int shift = 24;
+	u64 addr;
+	
+	/* When in doubt use brute force. */
+	while (shift < 48) { 
+		memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); 
+		for (i = 0; i < numnodes; i++) {
+			if (nodes[i].start == nodes[i].end) 
+				continue;
+			for (addr = nodes[i].start; 
+			     addr < nodes[i].end; 
+			     addr += (1UL << shift)) {
+				if (memnodemap[addr >> shift] != 0xff && 
+				    memnodemap[addr >> shift] != i) { 
+					printk(KERN_INFO 
+					    "node %d shift %d addr %Lx conflict %d\n", 
+					       i, shift, addr, memnodemap[addr>>shift]);
+					goto next; 
+				} 
+				memnodemap[addr >> shift] = i; 
+			} 
+		} 
+		return shift; 
+	next:
+		shift++; 
+	} 
+	memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); 
+	return -1; 
+}
+
+/* Initialize bootmem allocator for a node */
+void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
+{ 
+	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
+	unsigned long nodedata_phys;
+	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+
+	start = round_up(start, ZONE_ALIGN); 
+
+	printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+
+	start_pfn = start >> PAGE_SHIFT;
+	end_pfn = end >> PAGE_SHIFT;
+
+	nodedata_phys = find_e820_area(start, end, pgdat_size); 
+	if (nodedata_phys == -1L) 
+		panic("Cannot find memory pgdat in node %d\n", nodeid);
+
+	Dprintk("nodedata_phys %lx\n", nodedata_phys); 
+
+	node_data[nodeid] = phys_to_virt(nodedata_phys);
+	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
+	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
+	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+	/* Find a place for the bootmem map */
+	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 
+	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+	bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
+	if (bootmap_start == -1L) 
+		panic("Not enough continuous space for bootmap on node %d", nodeid); 
+	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 
+	
+	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+					 bootmap_start >> PAGE_SHIFT, 
+					 start_pfn, end_pfn); 
+
+	e820_bootmem_free(NODE_DATA(nodeid), start, end);
+
+	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
+	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+	node_set_online(nodeid);
+} 
+
+/* Initialize final allocator for a zone */
+void __init setup_node_zones(int nodeid)
+{ 
+	unsigned long start_pfn, end_pfn; 
+	unsigned long zones[MAX_NR_ZONES];
+	unsigned long dma_end_pfn;
+
+	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 
+
+	start_pfn = node_start_pfn(nodeid);
+	end_pfn = node_end_pfn(nodeid);
+
+	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
+	
+	/* All nodes > 0 have a zero length zone DMA */ 
+	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 
+	if (start_pfn < dma_end_pfn) { 
+		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
+		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 
+	} else { 
+		zones[ZONE_NORMAL] = end_pfn - start_pfn; 
+	} 
+    
+	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
+			    start_pfn, NULL); 
+} 
+
+void __init numa_init_array(void)
+{
+	int rr, i;
+	/* There are unfortunately some poorly designed mainboards around
+	   that only connect memory to a single CPU. This breaks the 1:1 cpu->node
+	   mapping. To avoid this fill in the mapping for all possible
+	   CPUs, as the number of CPUs is not known yet. 
+	   We round robin the existing nodes. */
+	rr = 0;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (cpu_to_node[i] != NUMA_NO_NODE)
+			continue;
+		rr = next_node(rr, node_online_map);
+		if (rr == MAX_NUMNODES)
+			rr = first_node(node_online_map);
+		cpu_to_node[i] = rr;
+		rr++; 
+	}
+
+	set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
+}
+
+#ifdef CONFIG_NUMA_EMU
+int numa_fake __initdata = 0;
+
+/* Numa emulation */
+static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+ 	int i;
+ 	struct node nodes[MAX_NUMNODES];
+ 	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+
+ 	/* Kludge needed for the hash function */
+ 	if (hweight64(sz) > 1) {
+ 		unsigned long x = 1;
+ 		while ((x << 1) < sz)
+ 			x <<= 1;
+ 		if (x < sz/2)
+ 			printk("Numa emulation unbalanced. Complain to maintainer\n");
+ 		sz = x;
+ 	}
+
+ 	memset(&nodes,0,sizeof(nodes));
+ 	for (i = 0; i < numa_fake; i++) {
+ 		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+ 		if (i == numa_fake-1)
+ 			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
+ 		nodes[i].end = nodes[i].start + sz;
+ 		if (i != numa_fake-1)
+ 			nodes[i].end--;
+ 		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
+ 		       i,
+ 		       nodes[i].start, nodes[i].end,
+ 		       (nodes[i].end - nodes[i].start) >> 20);
+		node_set_online(i);
+ 	}
+ 	memnode_shift = compute_hash_shift(nodes, numa_fake);
+ 	if (memnode_shift < 0) {
+ 		memnode_shift = 0;
+ 		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
+ 		return -1;
+ 	}
+ 	for_each_online_node(i)
+ 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ 	numa_init_array();
+ 	return 0;
+}
+#endif
+
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{ 
+	int i;
+
+#ifdef CONFIG_NUMA_EMU
+	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
+ 		return;
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+					  end_pfn << PAGE_SHIFT))
+ 		return;
+#endif
+
+#ifdef CONFIG_K8_NUMA
+	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
+		return;
+#endif
+	printk(KERN_INFO "%s\n",
+	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+	printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
+	       start_pfn << PAGE_SHIFT,
+	       end_pfn << PAGE_SHIFT); 
+		/* setup dummy node covering all memory */ 
+	memnode_shift = 63; 
+	memnodemap[0] = 0;
+	nodes_clear(node_online_map);
+	node_set_online(0);
+	for (i = 0; i < NR_CPUS; i++)
+		cpu_to_node[i] = 0;
+	node_to_cpumask[0] = cpumask_of_cpu(0);
+	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+}
+
+__init void numa_add_cpu(int cpu)
+{
+	/* BP is initialized elsewhere */
+	if (cpu) 
+		set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+} 
+
+unsigned long __init numa_free_all_bootmem(void) 
+{ 
+	int i;
+	unsigned long pages = 0;
+	for_each_online_node(i) {
+		pages += free_all_bootmem_node(NODE_DATA(i));
+	}
+	return pages;
+} 
+
+void __init paging_init(void)
+{ 
+	int i;
+	for_each_online_node(i) {
+		setup_node_zones(i); 
+	}
+} 
+
+/* [numa=off] */
+__init int numa_setup(char *opt) 
+{ 
+	if (!strncmp(opt,"off",3))
+		numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+	if(!strncmp(opt, "fake=", 5)) {
+		numa_fake = simple_strtoul(opt+5,NULL,0); ;
+		if (numa_fake >= MAX_NUMNODES)
+			numa_fake = MAX_NUMNODES;
+	}
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ 	if (!strncmp(opt,"noacpi",6))
+ 		acpi_numa = -1;
+#endif
+	return 1;
+} 
+
+EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(node_to_cpumask);
+EXPORT_SYMBOL(memnode_shift);
+EXPORT_SYMBOL(memnodemap);
+EXPORT_SYMBOL(node_data);
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
new file mode 100644
index 000000000000..94862e1ec032
--- /dev/null
+++ b/arch/x86_64/mm/pageattr.c
@@ -0,0 +1,235 @@
+/* 
+ * Copyright 2002 Andi Kleen, SuSE Labs. 
+ * Thanks to Ben LaHaise for precious feedback.
+ */ 
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+static inline pte_t *lookup_address(unsigned long address) 
+{ 
+	pgd_t *pgd = pgd_offset_k(address);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	if (pgd_none(*pgd))
+		return NULL;
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return NULL; 
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return NULL; 
+	if (pmd_large(*pmd))
+		return (pte_t *)pmd;
+	pte = pte_offset_kernel(pmd, address);
+	if (pte && !pte_present(*pte))
+		pte = NULL; 
+	return pte;
+} 
+
+static struct page *split_large_page(unsigned long address, pgprot_t prot,
+				     pgprot_t ref_prot)
+{ 
+	int i; 
+	unsigned long addr;
+	struct page *base = alloc_pages(GFP_KERNEL, 0);
+	pte_t *pbase;
+	if (!base) 
+		return NULL;
+	address = __pa(address);
+	addr = address & LARGE_PAGE_MASK; 
+	pbase = (pte_t *)page_address(base);
+	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
+		pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
+				   addr == address ? prot : ref_prot);
+	}
+	return base;
+} 
+
+
+static void flush_kernel_map(void *address) 
+{
+	if (0 && address && cpu_has_clflush) {
+		/* is this worth it? */ 
+		int i;
+		for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) 
+			asm volatile("clflush (%0)" :: "r" (address + i)); 
+	} else
+		asm volatile("wbinvd":::"memory"); 
+	if (address)
+		__flush_tlb_one(address);
+	else
+		__flush_tlb_all();
+}
+
+
+static inline void flush_map(unsigned long address)
+{	
+	on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
+}
+
+struct deferred_page { 
+	struct deferred_page *next; 
+	struct page *fpage;
+	unsigned long address;
+}; 
+static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
+
+static inline void save_page(unsigned long address, struct page *fpage)
+{
+	struct deferred_page *df;
+	df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); 
+	if (!df) {
+		flush_map(address);
+		__free_page(fpage);
+	} else { 
+		df->next = df_list;
+		df->fpage = fpage;
+		df->address = address;
+		df_list = df;
+	} 			
+}
+
+/* 
+ * No more special protections in this 2/4MB area - revert to a
+ * large page again. 
+ */
+static void revert_page(unsigned long address, pgprot_t ref_prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t large_pte;
+
+	pgd = pgd_offset_k(address);
+	BUG_ON(pgd_none(*pgd));
+	pud = pud_offset(pgd,address);
+	BUG_ON(pud_none(*pud));
+	pmd = pmd_offset(pud, address);
+	BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
+	pgprot_val(ref_prot) |= _PAGE_PSE;
+	large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
+	set_pte((pte_t *)pmd, large_pte);
+}      
+
+static int
+__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
+				   pgprot_t ref_prot)
+{ 
+	pte_t *kpte; 
+	struct page *kpte_page;
+	unsigned kpte_flags;
+	kpte = lookup_address(address);
+	if (!kpte) return 0;
+	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+	kpte_flags = pte_val(*kpte); 
+	if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
+		if ((kpte_flags & _PAGE_PSE) == 0) { 
+			set_pte(kpte, pfn_pte(pfn, prot));
+		} else {
+ 			/*
+ 			 * split_large_page will take the reference for this change_page_attr
+ 			 * on the split page.
+ 			 */
+			struct page *split = split_large_page(address, prot, ref_prot); 
+			if (!split)
+				return -ENOMEM;
+			set_pte(kpte,mk_pte(split, ref_prot));
+			kpte_page = split;
+		}	
+		get_page(kpte_page);
+	} else if ((kpte_flags & _PAGE_PSE) == 0) { 
+		set_pte(kpte, pfn_pte(pfn, ref_prot));
+		__put_page(kpte_page);
+	} else
+		BUG();
+
+	/* on x86-64 the direct mapping set at boot is not using 4k pages */
+ 	BUG_ON(PageReserved(kpte_page));
+
+	switch (page_count(kpte_page)) {
+ 	case 1:
+		save_page(address, kpte_page); 		     
+		revert_page(address, ref_prot);
+		break;
+ 	case 0:
+ 		BUG(); /* memleak and failed 2M page regeneration */
+ 	}
+	return 0;
+} 
+
+/*
+ * Change the page attributes of an page in the linear mapping.
+ *
+ * This should be used when a page is mapped with a different caching policy
+ * than write-back somewhere - some CPUs do not like it when mappings with
+ * different caching policies exist. This changes the page attributes of the
+ * in kernel linear mapping too.
+ * 
+ * The caller needs to ensure that there are no conflicting mappings elsewhere.
+ * This function only deals with the kernel linear map.
+ * 
+ * Caller must call global_flush_tlb() after this.
+ */
+int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
+{
+	int err = 0; 
+	int i; 
+
+	down_write(&init_mm.mmap_sem);
+	for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
+		unsigned long pfn = __pa(address) >> PAGE_SHIFT;
+
+		err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
+		if (err) 
+			break; 
+		/* Handle kernel mapping too which aliases part of the
+		 * lowmem */
+		if (__pa(address) < KERNEL_TEXT_SIZE) {
+			unsigned long addr2;
+			pgprot_t prot2 = prot;
+			addr2 = __START_KERNEL_map + __pa(address);
+ 			pgprot_val(prot2) &= ~_PAGE_NX;
+			err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
+		} 
+	} 	
+	up_write(&init_mm.mmap_sem); 
+	return err;
+}
+
+/* Don't call this for MMIO areas that may not have a mem_map entry */
+int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+	return change_page_attr_addr(addr, numpages, prot);
+}
+
+void global_flush_tlb(void)
+{ 
+	struct deferred_page *df, *next_df;
+
+	down_read(&init_mm.mmap_sem);
+	df = xchg(&df_list, NULL);
+	up_read(&init_mm.mmap_sem);
+	if (!df)
+		return;
+	flush_map((df && !df->next) ? df->address : 0);
+	for (; df; df = next_df) { 
+		next_df = df->next;
+		if (df->fpage) 
+			__free_page(df->fpage);
+		kfree(df);
+	} 
+} 
+
+EXPORT_SYMBOL(change_page_attr);
+EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
new file mode 100644
index 000000000000..5d01b31472e1
--- /dev/null
+++ b/arch/x86_64/mm/srat.c
@@ -0,0 +1,217 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/mmzone.h>
+#include <linux/bitmap.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <asm/proto.h>
+#include <asm/numa.h>
+
+static struct acpi_table_slit *acpi_slit;
+
+static nodemask_t nodes_parsed __initdata;
+static nodemask_t nodes_found __initdata;
+static struct node nodes[MAX_NUMNODES] __initdata;
+static __u8  pxm2node[256] = { [0 ... 255] = 0xff };
+
+static __init int setup_node(int pxm)
+{
+	unsigned node = pxm2node[pxm];
+	if (node == 0xff) {
+		if (nodes_weight(nodes_found) >= MAX_NUMNODES)
+			return -1;
+		node = first_unset_node(nodes_found); 
+		node_set(node, nodes_found);
+		pxm2node[pxm] = node;
+	}
+	return pxm2node[pxm];
+}
+
+static __init int conflicting_nodes(unsigned long start, unsigned long end)
+{
+	int i;
+	for_each_online_node(i) {
+		struct node *nd = &nodes[i];
+		if (nd->start == nd->end)
+			continue;
+		if (nd->end > start && nd->start < end)
+			return 1;
+		if (nd->end == end && nd->start == start)
+			return 1;
+	}
+	return -1;
+}
+
+static __init void cutoff_node(int i, unsigned long start, unsigned long end)
+{
+	struct node *nd = &nodes[i];
+	if (nd->start < start) {
+		nd->start = start;
+		if (nd->end < nd->start)
+			nd->start = nd->end;
+	}
+	if (nd->end > end) {
+		if (!(end & 0xfff))
+			end--;
+		nd->end = end;
+		if (nd->start > nd->end)
+			nd->start = nd->end;
+	}
+}
+
+static __init void bad_srat(void)
+{
+	printk(KERN_ERR "SRAT: SRAT not used.\n");
+	acpi_numa = -1;
+}
+
+static __init inline int srat_disabled(void)
+{
+	return numa_off || acpi_numa < 0;
+}
+
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+	acpi_slit = slit;
+}
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
+{
+	int pxm, node;
+	if (srat_disabled() || pa->flags.enabled == 0)
+		return;
+	pxm = pa->proximity_domain;
+	node = setup_node(pxm);
+	if (node < 0) {
+		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+		bad_srat();
+		return;
+	}
+	if (pa->apic_id >= NR_CPUS) {
+		printk(KERN_ERR "SRAT: lapic %u too large.\n",
+		       pa->apic_id);
+		bad_srat();
+		return;
+	}
+	cpu_to_node[pa->apic_id] = node;
+	acpi_numa = 1;
+	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+	       pxm, pa->apic_id, node);
+}
+
+/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+void __init
+acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
+{
+	struct node *nd;
+	unsigned long start, end;
+	int node, pxm;
+	int i;
+
+	if (srat_disabled() || ma->flags.enabled == 0)
+		return;
+	/* hotplug bit is ignored for now */
+	pxm = ma->proximity_domain;
+	node = setup_node(pxm);
+	if (node < 0) {
+		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+		bad_srat();
+		return;
+	}
+	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
+	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
+	i = conflicting_nodes(start, end);
+	if (i >= 0) {
+		printk(KERN_ERR
+		       "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n",
+		       pxm, start, end, i, nodes[i].start, nodes[i].end);
+		bad_srat();
+		return;
+	}
+	nd = &nodes[node];
+	if (!node_test_and_set(node, nodes_parsed)) {
+		nd->start = start;
+		nd->end = end;
+	} else {
+		if (start < nd->start)
+			nd->start = start;
+		if (nd->end < end)
+			nd->end = end;
+	}
+	if (!(nd->end & 0xfff))
+		nd->end--;
+	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
+	       nd->start, nd->end);
+}
+
+void __init acpi_numa_arch_fixup(void) {}
+
+/* Use the information discovered above to actually set up the nodes. */
+int __init acpi_scan_nodes(unsigned long start, unsigned long end)
+{
+	int i;
+	if (acpi_numa <= 0)
+		return -1;
+	memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed));
+	if (memnode_shift < 0) {
+		printk(KERN_ERR
+		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
+		bad_srat();
+		return -1;
+	}
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		if (!node_isset(i, nodes_parsed))
+			continue;
+		cutoff_node(i, start, end);
+		if (nodes[i].start == nodes[i].end) { 
+			node_clear(i, nodes_parsed);
+			continue;
+		}
+		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+	}
+	for (i = 0; i < NR_CPUS; i++) { 
+		if (cpu_to_node[i] == NUMA_NO_NODE)
+			continue;
+		if (!node_isset(cpu_to_node[i], nodes_parsed))
+			cpu_to_node[i] = NUMA_NO_NODE; 
+	}
+	numa_init_array();
+	return 0;
+}
+
+int node_to_pxm(int n)
+{
+       int i;
+       if (pxm2node[n] == n)
+               return n;
+       for (i = 0; i < 256; i++)
+               if (pxm2node[i] == n)
+                       return i;
+       return 0;
+}
+
+int __node_distance(int a, int b)
+{
+	int index;
+
+	if (!acpi_slit)
+		return a == b ? 10 : 20;
+	index = acpi_slit->localities * node_to_pxm(a);
+	return acpi_slit->entry[index + node_to_pxm(b)];
+}
+
+EXPORT_SYMBOL(__node_distance);
diff --git a/arch/x86_64/oprofile/Kconfig b/arch/x86_64/oprofile/Kconfig
new file mode 100644
index 000000000000..5ade19801b97
--- /dev/null
+++ b/arch/x86_64/oprofile/Kconfig
@@ -0,0 +1,23 @@
+
+menu "Profiling support"
+	depends on EXPERIMENTAL
+
+config PROFILING
+	bool "Profiling support (EXPERIMENTAL)"
+	help
+	  Say Y here to enable the extended profiling support mechanisms used
+	  by profilers such as OProfile.
+	  
+
+config OPROFILE
+	tristate "OProfile system profiling (EXPERIMENTAL)"
+	depends on PROFILING
+	help
+	  OProfile is a profiling system capable of profiling the
+	  whole system, include the kernel, kernel modules, libraries,
+	  and applications.
+
+	  If unsure, say N.
+
+endmenu
+
diff --git a/arch/x86_64/oprofile/Makefile b/arch/x86_64/oprofile/Makefile
new file mode 100644
index 000000000000..6be32683e1bc
--- /dev/null
+++ b/arch/x86_64/oprofile/Makefile
@@ -0,0 +1,19 @@
+#
+# oprofile for x86-64.
+# Just reuse the one from i386. 
+#
+
+obj-$(CONFIG_OPROFILE) += oprofile.o
+ 
+DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
+	oprof.o cpu_buffer.o buffer_sync.o \
+	event_buffer.o oprofile_files.o \
+	oprofilefs.o oprofile_stats.o \
+	timer_int.o )
+
+OPROFILE-y := init.o backtrace.o
+OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \
+				     op_model_ppro.o
+OPROFILE-$(CONFIG_X86_IO_APIC)    += nmi_timer_int.o 
+
+oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile
new file mode 100644
index 000000000000..37c92e841dec
--- /dev/null
+++ b/arch/x86_64/pci/Makefile
@@ -0,0 +1,24 @@
+#
+# Makefile for X86_64 specific PCI routines
+#
+# Reuse the i386 PCI subsystem
+#
+CFLAGS += -Iarch/i386/pci
+
+obj-y		:= i386.o
+obj-$(CONFIG_PCI_DIRECT)+= direct.o
+obj-y		+= fixup.o
+obj-$(CONFIG_ACPI_PCI)	+= acpi.o
+obj-y			+= legacy.o irq.o common.o
+# mmconfig has a 64bit special
+obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
+
+obj-$(CONFIG_NUMA)	+= k8-bus.o
+
+direct-y += ../../i386/pci/direct.o
+acpi-y   += ../../i386/pci/acpi.o
+legacy-y += ../../i386/pci/legacy.o
+irq-y    += ../../i386/pci/irq.o
+common-y += ../../i386/pci/common.o
+fixup-y  += ../../i386/pci/fixup.o
+i386-y  += ../../i386/pci/i386.o
diff --git a/arch/x86_64/pci/Makefile-BUS b/arch/x86_64/pci/Makefile-BUS
new file mode 100644
index 000000000000..291985f0d2e4
--- /dev/null
+++ b/arch/x86_64/pci/Makefile-BUS
@@ -0,0 +1,22 @@
+#
+# Makefile for X86_64 specific PCI routines
+#
+# Reuse the i386 PCI subsystem
+#
+CFLAGS += -I arch/i386/pci
+
+obj-y		:= i386.o
+obj-$(CONFIG_PCI_DIRECT)+= direct.o
+obj-y		+= fixup.o
+obj-$(CONFIG_ACPI_PCI)	+= acpi.o
+obj-y			+= legacy.o irq.o common.o
+# mmconfig has a 64bit special
+obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
+
+direct-y += ../../i386/pci/direct.o
+acpi-y   += ../../i386/pci/acpi.o
+legacy-y += ../../i386/pci/legacy.o
+irq-y    += ../../i386/pci/irq.o
+common-y += ../../i386/pci/common.o
+fixup-y  += ../../i386/pci/fixup.o
+i386-y  += ../../i386/pci/i386.o
diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c
new file mode 100644
index 000000000000..62349c78db57
--- /dev/null
+++ b/arch/x86_64/pci/k8-bus.c
@@ -0,0 +1,78 @@
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <asm/mpspec.h>
+#include <linux/cpumask.h>
+
+/*
+ * This discovers the pcibus <-> node mapping on AMD K8.
+ *
+ * RED-PEN need to call this again on PCI hotplug
+ * RED-PEN empty cpus get reported wrong
+ */
+
+#define NODE_ID_REGISTER 0x60
+#define NODE_ID(dword) (dword & 0x07)
+#define LDT_BUS_NUMBER_REGISTER_0 0x94
+#define LDT_BUS_NUMBER_REGISTER_1 0xB4
+#define LDT_BUS_NUMBER_REGISTER_2 0xD4
+#define NR_LDT_BUS_NUMBER_REGISTERS 3
+#define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF)
+#define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF)
+#define PCI_DEVICE_ID_K8HTCONFIG 0x1100
+
+/**
+ * fill_mp_bus_to_cpumask()
+ * fills the mp_bus_to_cpumask array based according to the LDT Bus Number
+ * Registers found in the K8 northbridge
+ */
+__init static int
+fill_mp_bus_to_cpumask(void)
+{
+	struct pci_dev *nb_dev = NULL;
+	int i, j, printed;
+	u32 ldtbus, nid;
+	static int lbnr[3] = {
+		LDT_BUS_NUMBER_REGISTER_0,
+		LDT_BUS_NUMBER_REGISTER_1,
+		LDT_BUS_NUMBER_REGISTER_2
+	};
+
+	while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD,
+			PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) {
+		pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid);
+
+		for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) {
+			pci_read_config_dword(nb_dev, lbnr[i], &ldtbus);
+			/*
+			 * if there are no busses hanging off of the current
+			 * ldt link then both the secondary and subordinate
+			 * bus number fields are set to 0.
+			 */
+			if (!(SECONDARY_LDT_BUS_NUMBER(ldtbus) == 0
+				&& SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0)) {
+				for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus);
+				     j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus);
+				     j++)
+					pci_bus_to_cpumask[j] =
+						node_to_cpumask(NODE_ID(nid));
+			}
+		}
+	}
+
+	/* quick sanity check */
+	printed = 0;
+	for (i = 0; i < 256; i++) {
+		if (cpus_empty(pci_bus_to_cpumask[i])) {
+			pci_bus_to_cpumask[i] = CPU_MASK_ALL;
+			if (printed)
+				continue;
+			printk(KERN_ERR
+			       "k8-bus.c: some busses have empty cpu mask\n");
+			printed = 1;
+		}
+	}
+
+	return 0;
+}
+
+fs_initcall(fill_mp_bus_to_cpumask);
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
new file mode 100644
index 000000000000..b693c232fd07
--- /dev/null
+++ b/arch/x86_64/pci/mmconfig.c
@@ -0,0 +1,104 @@
+/*
+ * mmconfig.c - Low-level direct PCI config space access via MMCONFIG
+ * 
+ * This is an 64bit optimized version that always keeps the full mmconfig
+ * space mapped. This allows lockless config space operation.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include "pci.h"
+
+#define MMCONFIG_APER_SIZE (256*1024*1024)
+
+/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
+u32 pci_mmcfg_base_addr;
+
+/* Static virtual mapping of the MMCONFIG aperture */
+char *pci_mmcfg_virt;
+
+static inline char *pci_dev_base(unsigned int bus, unsigned int devfn)
+{
+	return pci_mmcfg_virt + ((bus << 20) | (devfn << 12));
+}
+
+static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
+			  unsigned int devfn, int reg, int len, u32 *value)
+{
+	char *addr = pci_dev_base(bus, devfn); 
+
+	if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
+		return -EINVAL;
+
+	switch (len) {
+	case 1:
+		*value = readb(addr + reg);
+		break;
+	case 2:
+		*value = readw(addr + reg);
+		break;
+	case 4:
+		*value = readl(addr + reg);
+		break;
+	}
+
+	return 0;
+}
+
+static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
+			   unsigned int devfn, int reg, int len, u32 value)
+{
+	char *addr = pci_dev_base(bus,devfn);
+
+	if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
+		return -EINVAL;
+
+	switch (len) {
+	case 1:
+		writeb(value, addr + reg);
+		break;
+	case 2:
+		writew(value, addr + reg);
+		break;
+	case 4:
+		writel(value, addr + reg);
+		break;
+	}
+
+	return 0;
+}
+
+static struct pci_raw_ops pci_mmcfg = {
+	.read =		pci_mmcfg_read,
+	.write =	pci_mmcfg_write,
+};
+
+static int __init pci_mmcfg_init(void)
+{
+	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+		return 0;
+	if (!pci_mmcfg_base_addr)
+		return 0;
+
+	/* Kludge for now. Don't use mmconfig on AMD systems because
+	   those have some busses where mmconfig doesn't work,
+	   and we don't parse ACPI MCFG well enough to handle that. 
+	   Remove when proper handling is added. */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return 0; 
+
+	/* RED-PEN i386 doesn't do _nocache right now */
+	pci_mmcfg_virt = ioremap_nocache(pci_mmcfg_base_addr, MMCONFIG_APER_SIZE);
+	if (!pci_mmcfg_virt) { 
+		printk("PCI: Cannot map mmconfig aperture\n");
+		return 0;
+	}	
+
+	printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_base_addr);
+	raw_pci_ops = &pci_mmcfg;
+	pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+
+	return 0;
+}
+
+arch_initcall(pci_mmcfg_init);