arch/tile: finish enabling support for TILE-Gx 64-bit chip

This support was partially present in the existing code (look for "__tilegx__" ifdefs) but with this change you can build a working kernel using the TILE-Gx toolchain and ARCH=tilegx. Most of these files are new, generally adding a foo_64.c file where previously there was just a foo_32.c file. The ARCH=tilegx directive redirects to arch/tile, not arch/tilegx, using the existing SRCARCH mechanism in the top-level Makefile. Changes to existing files: - <asm/bitops.h> and <asm/bitops_32.h> changed to factor the include of <asm-generic/bitops/non-atomic.h> in the common header. - <asm/compat.h> and arch/tile/kernel/compat.c changed to remove the "const" markers I had put on compat_sys_execve() when trying to match some recent similar changes to the non-compat execve. It turns out the compat version wasn't "upgraded" to use const. - <asm/opcode-tile_64.h> and <asm/opcode_constants_64.h> were previously included accidentally, with the 32-bit contents. Now they have the proper 64-bit contents. Finally, I had to hack the existing hacky drivers/input/input-compat.h to add yet another "#ifdef" for INPUT_COMPAT_TEST (same as x86_64). Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> [drivers/input]
2024-11-18 23:54:26 +08:00 · 2011-05-04 14:38:26 -04:00 · 2011-05-04 14:38:26 -04:00 · 18aecc2b64
commit 18aecc2b64
parent be84cb4383
30 changed files with 9349 additions and 1373 deletions
--- a/5
+++ b/5
@ -220,6 +220,11 @@ ifeq ($(ARCH),sh64)
       SRCARCH := sh
 endif

+# Additional ARCH settings for tile
+ifeq ($(ARCH),tilegx)
+       SRCARCH := tile
+endif
+
 # Where to locate arch specific headers
 hdr-arch  := $(SRCARCH)

--- a/arch/tile/configs/tilegx_defconfig
+++ b/arch/tile/configs/tilegx_defconfig
--- a/arch/tile/include/arch/chip_tilegx.h
+++ b/arch/tile/include/arch/chip_tilegx.h
@ -0,0 +1,258 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * @file
+ * Global header file.
+ * This header file specifies defines for TILE-Gx.
+ */
+
+#ifndef __ARCH_CHIP_H__
+#define __ARCH_CHIP_H__
+
+/** Specify chip version.
+ * When possible, prefer the CHIP_xxx symbols below for future-proofing.
+ * This is intended for cross-compiling; native compilation should
+ * use the predefined __tile_chip__ symbol.
+ */
+#define TILE_CHIP 10
+
+/** Specify chip revision.
+ * This provides for the case of a respin of a particular chip type;
+ * the normal value for this symbol is "0".
+ * This is intended for cross-compiling; native compilation should
+ * use the predefined __tile_chip_rev__ symbol.
+ */
+#define TILE_CHIP_REV 0
+
+/** The name of this architecture. */
+#define CHIP_ARCH_NAME "tilegx"
+
+/** The ELF e_machine type for binaries for this chip. */
+#define CHIP_ELF_TYPE() EM_TILEGX
+
+/** The alternate ELF e_machine type for binaries for this chip. */
+#define CHIP_COMPAT_ELF_TYPE() 0x2597
+
+/** What is the native word size of the machine? */
+#define CHIP_WORD_SIZE() 64
+
+/** How many bits of a virtual address are used. Extra bits must be
+ * the sign extension of the low bits.
+ */
+#define CHIP_VA_WIDTH() 42
+
+/** How many bits are in a physical address? */
+#define CHIP_PA_WIDTH() 40
+
+/** Size of the L2 cache, in bytes. */
+#define CHIP_L2_CACHE_SIZE() 262144
+
+/** Log size of an L2 cache line in bytes. */
+#define CHIP_L2_LOG_LINE_SIZE() 6
+
+/** Size of an L2 cache line, in bytes. */
+#define CHIP_L2_LINE_SIZE() (1 << CHIP_L2_LOG_LINE_SIZE())
+
+/** Associativity of the L2 cache. */
+#define CHIP_L2_ASSOC() 8
+
+/** Size of the L1 data cache, in bytes. */
+#define CHIP_L1D_CACHE_SIZE() 32768
+
+/** Log size of an L1 data cache line in bytes. */
+#define CHIP_L1D_LOG_LINE_SIZE() 6
+
+/** Size of an L1 data cache line, in bytes. */
+#define CHIP_L1D_LINE_SIZE() (1 << CHIP_L1D_LOG_LINE_SIZE())
+
+/** Associativity of the L1 data cache. */
+#define CHIP_L1D_ASSOC() 2
+
+/** Size of the L1 instruction cache, in bytes. */
+#define CHIP_L1I_CACHE_SIZE() 32768
+
+/** Log size of an L1 instruction cache line in bytes. */
+#define CHIP_L1I_LOG_LINE_SIZE() 6
+
+/** Size of an L1 instruction cache line, in bytes. */
+#define CHIP_L1I_LINE_SIZE() (1 << CHIP_L1I_LOG_LINE_SIZE())
+
+/** Associativity of the L1 instruction cache. */
+#define CHIP_L1I_ASSOC() 2
+
+/** Stride with which flush instructions must be issued. */
+#define CHIP_FLUSH_STRIDE() CHIP_L2_LINE_SIZE()
+
+/** Stride with which inv instructions must be issued. */
+#define CHIP_INV_STRIDE() CHIP_L2_LINE_SIZE()
+
+/** Stride with which finv instructions must be issued. */
+#define CHIP_FINV_STRIDE() CHIP_L2_LINE_SIZE()
+
+/** Can the local cache coherently cache data that is homed elsewhere? */
+#define CHIP_HAS_COHERENT_LOCAL_CACHE() 1
+
+/** How many simultaneous outstanding victims can the L2 cache have? */
+#define CHIP_MAX_OUTSTANDING_VICTIMS() 128
+
+/** Does the TLB support the NC and NOALLOC bits? */
+#define CHIP_HAS_NC_AND_NOALLOC_BITS() 1
+
+/** Does the chip support hash-for-home caching? */
+#define CHIP_HAS_CBOX_HOME_MAP() 1
+
+/** Number of entries in the chip's home map tables. */
+#define CHIP_CBOX_HOME_MAP_SIZE() 128
+
+/** Do uncacheable requests miss in the cache regardless of whether
+ * there is matching data? */
+#define CHIP_HAS_ENFORCED_UNCACHEABLE_REQUESTS() 1
+
+/** Does the mf instruction wait for victims? */
+#define CHIP_HAS_MF_WAITS_FOR_VICTIMS() 0
+
+/** Does the chip have an "inv" instruction that doesn't also flush? */
+#define CHIP_HAS_INV() 1
+
+/** Does the chip have a "wh64" instruction? */
+#define CHIP_HAS_WH64() 1
+
+/** Does this chip have a 'dword_align' instruction? */
+#define CHIP_HAS_DWORD_ALIGN() 0
+
+/** Number of performance counters. */
+#define CHIP_PERFORMANCE_COUNTERS() 4
+
+/** Does this chip have auxiliary performance counters? */
+#define CHIP_HAS_AUX_PERF_COUNTERS() 1
+
+/** Is the CBOX_MSR1 SPR supported? */
+#define CHIP_HAS_CBOX_MSR1() 0
+
+/** Is the TILE_RTF_HWM SPR supported? */
+#define CHIP_HAS_TILE_RTF_HWM() 1
+
+/** Is the TILE_WRITE_PENDING SPR supported? */
+#define CHIP_HAS_TILE_WRITE_PENDING() 0
+
+/** Is the PROC_STATUS SPR supported? */
+#define CHIP_HAS_PROC_STATUS_SPR() 1
+
+/** Is the DSTREAM_PF SPR supported? */
+#define CHIP_HAS_DSTREAM_PF() 1
+
+/** Log of the number of mshims we have. */
+#define CHIP_LOG_NUM_MSHIMS() 2
+
+/** Are the bases of the interrupt vector areas fixed? */
+#define CHIP_HAS_FIXED_INTVEC_BASE() 0
+
+/** Are the interrupt masks split up into 2 SPRs? */
+#define CHIP_HAS_SPLIT_INTR_MASK() 0
+
+/** Is the cycle count split up into 2 SPRs? */
+#define CHIP_HAS_SPLIT_CYCLE() 0
+
+/** Does the chip have a static network? */
+#define CHIP_HAS_SN() 0
+
+/** Does the chip have a static network processor? */
+#define CHIP_HAS_SN_PROC() 0
+
+/** Size of the L1 static network processor instruction cache, in bytes. */
+/* #define CHIP_L1SNI_CACHE_SIZE() -- does not apply to chip 10 */
+
+/** Does the chip have DMA support in each tile? */
+#define CHIP_HAS_TILE_DMA() 0
+
+/** Does the chip have the second revision of the directly accessible
+ *  dynamic networks?  This encapsulates a number of characteristics,
+ *  including the absence of the catch-all, the absence of inline message
+ *  tags, the absence of support for network context-switching, and so on.
+ */
+#define CHIP_HAS_REV1_XDN() 1
+
+/** Does the chip have cmpexch and similar (fetchadd, exch, etc.)? */
+#define CHIP_HAS_CMPEXCH() 1
+
+/** Does the chip have memory-mapped I/O support? */
+#define CHIP_HAS_MMIO() 1
+
+/** Does the chip have post-completion interrupts? */
+#define CHIP_HAS_POST_COMPLETION_INTERRUPTS() 1
+
+/** Does the chip have native single step support? */
+#define CHIP_HAS_SINGLE_STEP() 1
+
+#ifndef __OPEN_SOURCE__  /* features only relevant to hypervisor-level code */
+
+/** How many entries are present in the instruction TLB? */
+#define CHIP_ITLB_ENTRIES() 16
+
+/** How many entries are present in the data TLB? */
+#define CHIP_DTLB_ENTRIES() 32
+
+/** How many MAF entries does the XAUI shim have? */
+#define CHIP_XAUI_MAF_ENTRIES() 32
+
+/** Does the memory shim have a source-id table? */
+#define CHIP_HAS_MSHIM_SRCID_TABLE() 0
+
+/** Does the L1 instruction cache clear on reset? */
+#define CHIP_HAS_L1I_CLEAR_ON_RESET() 1
+
+/** Does the chip come out of reset with valid coordinates on all tiles?
+ * Note that if defined, this also implies that the upper left is 1,1.
+ */
+#define CHIP_HAS_VALID_TILE_COORD_RESET() 1
+
+/** Does the chip have unified packet formats? */
+#define CHIP_HAS_UNIFIED_PACKET_FORMATS() 1
+
+/** Does the chip support write reordering? */
+#define CHIP_HAS_WRITE_REORDERING() 1
+
+/** Does the chip support Y-X routing as well as X-Y? */
+#define CHIP_HAS_Y_X_ROUTING() 1
+
+/** Is INTCTRL_3 managed with the correct MPL? */
+#define CHIP_HAS_INTCTRL_3_STATUS_FIX() 1
+
+/** Is it possible to configure the chip to be big-endian? */
+#define CHIP_HAS_BIG_ENDIAN_CONFIG() 1
+
+/** Is the CACHE_RED_WAY_OVERRIDDEN SPR supported? */
+#define CHIP_HAS_CACHE_RED_WAY_OVERRIDDEN() 0
+
+/** Is the DIAG_TRACE_WAY SPR supported? */
+#define CHIP_HAS_DIAG_TRACE_WAY() 0
+
+/** Is the MEM_STRIPE_CONFIG SPR supported? */
+#define CHIP_HAS_MEM_STRIPE_CONFIG() 1
+
+/** Are the TLB_PERF SPRs supported? */
+#define CHIP_HAS_TLB_PERF() 1
+
+/** Is the VDN_SNOOP_SHIM_CTL SPR supported? */
+#define CHIP_HAS_VDN_SNOOP_SHIM_CTL() 0
+
+/** Does the chip support rev1 DMA packets? */
+#define CHIP_HAS_REV1_DMA_PACKETS() 1
+
+/** Does the chip have an IPI shim? */
+#define CHIP_HAS_IPI() 1
+
+#endif /* !__OPEN_SOURCE__ */
+#endif /* __ARCH_CHIP_H__ */
--- a/arch/tile/include/arch/interrupts_64.h
+++ b/arch/tile/include/arch/interrupts_64.h
@ -0,0 +1,276 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef __ARCH_INTERRUPTS_H__
+#define __ARCH_INTERRUPTS_H__
+
+/** Mask for an interrupt. */
+#ifdef __ASSEMBLER__
+/* Note: must handle breaking interrupts into high and low words manually. */
+#define INT_MASK(intno) (1 << (intno))
+#else
+#define INT_MASK(intno) (1ULL << (intno))
+#endif
+
+
+/** Where a given interrupt executes */
+#define INTERRUPT_VECTOR(i, pl) (0xFC000000 + ((pl) << 24) + ((i) << 8))
+
+/** Where to store a vector for a given interrupt. */
+#define USER_INTERRUPT_VECTOR(i) INTERRUPT_VECTOR(i, 0)
+
+/** The base address of user-level interrupts. */
+#define USER_INTERRUPT_VECTOR_BASE INTERRUPT_VECTOR(0, 0)
+
+
+/** Additional synthetic interrupt. */
+#define INT_BREAKPOINT (63)
+
+#define INT_MEM_ERROR    0
+#define INT_SINGLE_STEP_3    1
+#define INT_SINGLE_STEP_2    2
+#define INT_SINGLE_STEP_1    3
+#define INT_SINGLE_STEP_0    4
+#define INT_IDN_COMPLETE    5
+#define INT_UDN_COMPLETE    6
+#define INT_ITLB_MISS    7
+#define INT_ILL    8
+#define INT_GPV    9
+#define INT_IDN_ACCESS   10
+#define INT_UDN_ACCESS   11
+#define INT_SWINT_3   12
+#define INT_SWINT_2   13
+#define INT_SWINT_1   14
+#define INT_SWINT_0   15
+#define INT_ILL_TRANS   16
+#define INT_UNALIGN_DATA   17
+#define INT_DTLB_MISS   18
+#define INT_DTLB_ACCESS   19
+#define INT_IDN_FIREWALL   20
+#define INT_UDN_FIREWALL   21
+#define INT_TILE_TIMER   22
+#define INT_AUX_TILE_TIMER   23
+#define INT_IDN_TIMER   24
+#define INT_UDN_TIMER   25
+#define INT_IDN_AVAIL   26
+#define INT_UDN_AVAIL   27
+#define INT_IPI_3   28
+#define INT_IPI_2   29
+#define INT_IPI_1   30
+#define INT_IPI_0   31
+#define INT_PERF_COUNT   32
+#define INT_AUX_PERF_COUNT   33
+#define INT_INTCTRL_3   34
+#define INT_INTCTRL_2   35
+#define INT_INTCTRL_1   36
+#define INT_INTCTRL_0   37
+#define INT_BOOT_ACCESS   38
+#define INT_WORLD_ACCESS   39
+#define INT_I_ASID   40
+#define INT_D_ASID   41
+#define INT_DOUBLE_FAULT   42
+
+#define NUM_INTERRUPTS 43
+
+#ifndef __ASSEMBLER__
+#define QUEUED_INTERRUPTS ( \
+    INT_MASK(INT_MEM_ERROR) | \
+    INT_MASK(INT_IDN_COMPLETE) | \
+    INT_MASK(INT_UDN_COMPLETE) | \
+    INT_MASK(INT_IDN_FIREWALL) | \
+    INT_MASK(INT_UDN_FIREWALL) | \
+    INT_MASK(INT_TILE_TIMER) | \
+    INT_MASK(INT_AUX_TILE_TIMER) | \
+    INT_MASK(INT_IDN_TIMER) | \
+    INT_MASK(INT_UDN_TIMER) | \
+    INT_MASK(INT_IDN_AVAIL) | \
+    INT_MASK(INT_UDN_AVAIL) | \
+    INT_MASK(INT_IPI_3) | \
+    INT_MASK(INT_IPI_2) | \
+    INT_MASK(INT_IPI_1) | \
+    INT_MASK(INT_IPI_0) | \
+    INT_MASK(INT_PERF_COUNT) | \
+    INT_MASK(INT_AUX_PERF_COUNT) | \
+    INT_MASK(INT_INTCTRL_3) | \
+    INT_MASK(INT_INTCTRL_2) | \
+    INT_MASK(INT_INTCTRL_1) | \
+    INT_MASK(INT_INTCTRL_0) | \
+    INT_MASK(INT_BOOT_ACCESS) | \
+    INT_MASK(INT_WORLD_ACCESS) | \
+    INT_MASK(INT_I_ASID) | \
+    INT_MASK(INT_D_ASID) | \
+    INT_MASK(INT_DOUBLE_FAULT) | \
+    0)
+#define NONQUEUED_INTERRUPTS ( \
+    INT_MASK(INT_SINGLE_STEP_3) | \
+    INT_MASK(INT_SINGLE_STEP_2) | \
+    INT_MASK(INT_SINGLE_STEP_1) | \
+    INT_MASK(INT_SINGLE_STEP_0) | \
+    INT_MASK(INT_ITLB_MISS) | \
+    INT_MASK(INT_ILL) | \
+    INT_MASK(INT_GPV) | \
+    INT_MASK(INT_IDN_ACCESS) | \
+    INT_MASK(INT_UDN_ACCESS) | \
+    INT_MASK(INT_SWINT_3) | \
+    INT_MASK(INT_SWINT_2) | \
+    INT_MASK(INT_SWINT_1) | \
+    INT_MASK(INT_SWINT_0) | \
+    INT_MASK(INT_ILL_TRANS) | \
+    INT_MASK(INT_UNALIGN_DATA) | \
+    INT_MASK(INT_DTLB_MISS) | \
+    INT_MASK(INT_DTLB_ACCESS) | \
+    0)
+#define CRITICAL_MASKED_INTERRUPTS ( \
+    INT_MASK(INT_MEM_ERROR) | \
+    INT_MASK(INT_SINGLE_STEP_3) | \
+    INT_MASK(INT_SINGLE_STEP_2) | \
+    INT_MASK(INT_SINGLE_STEP_1) | \
+    INT_MASK(INT_SINGLE_STEP_0) | \
+    INT_MASK(INT_IDN_COMPLETE) | \
+    INT_MASK(INT_UDN_COMPLETE) | \
+    INT_MASK(INT_IDN_FIREWALL) | \
+    INT_MASK(INT_UDN_FIREWALL) | \
+    INT_MASK(INT_TILE_TIMER) | \
+    INT_MASK(INT_AUX_TILE_TIMER) | \
+    INT_MASK(INT_IDN_TIMER) | \
+    INT_MASK(INT_UDN_TIMER) | \
+    INT_MASK(INT_IDN_AVAIL) | \
+    INT_MASK(INT_UDN_AVAIL) | \
+    INT_MASK(INT_IPI_3) | \
+    INT_MASK(INT_IPI_2) | \
+    INT_MASK(INT_IPI_1) | \
+    INT_MASK(INT_IPI_0) | \
+    INT_MASK(INT_PERF_COUNT) | \
+    INT_MASK(INT_AUX_PERF_COUNT) | \
+    INT_MASK(INT_INTCTRL_3) | \
+    INT_MASK(INT_INTCTRL_2) | \
+    INT_MASK(INT_INTCTRL_1) | \
+    INT_MASK(INT_INTCTRL_0) | \
+    0)
+#define CRITICAL_UNMASKED_INTERRUPTS ( \
+    INT_MASK(INT_ITLB_MISS) | \
+    INT_MASK(INT_ILL) | \
+    INT_MASK(INT_GPV) | \
+    INT_MASK(INT_IDN_ACCESS) | \
+    INT_MASK(INT_UDN_ACCESS) | \
+    INT_MASK(INT_SWINT_3) | \
+    INT_MASK(INT_SWINT_2) | \
+    INT_MASK(INT_SWINT_1) | \
+    INT_MASK(INT_SWINT_0) | \
+    INT_MASK(INT_ILL_TRANS) | \
+    INT_MASK(INT_UNALIGN_DATA) | \
+    INT_MASK(INT_DTLB_MISS) | \
+    INT_MASK(INT_DTLB_ACCESS) | \
+    INT_MASK(INT_BOOT_ACCESS) | \
+    INT_MASK(INT_WORLD_ACCESS) | \
+    INT_MASK(INT_I_ASID) | \
+    INT_MASK(INT_D_ASID) | \
+    INT_MASK(INT_DOUBLE_FAULT) | \
+    0)
+#define MASKABLE_INTERRUPTS ( \
+    INT_MASK(INT_MEM_ERROR) | \
+    INT_MASK(INT_SINGLE_STEP_3) | \
+    INT_MASK(INT_SINGLE_STEP_2) | \
+    INT_MASK(INT_SINGLE_STEP_1) | \
+    INT_MASK(INT_SINGLE_STEP_0) | \
+    INT_MASK(INT_IDN_COMPLETE) | \
+    INT_MASK(INT_UDN_COMPLETE) | \
+    INT_MASK(INT_IDN_FIREWALL) | \
+    INT_MASK(INT_UDN_FIREWALL) | \
+    INT_MASK(INT_TILE_TIMER) | \
+    INT_MASK(INT_AUX_TILE_TIMER) | \
+    INT_MASK(INT_IDN_TIMER) | \
+    INT_MASK(INT_UDN_TIMER) | \
+    INT_MASK(INT_IDN_AVAIL) | \
+    INT_MASK(INT_UDN_AVAIL) | \
+    INT_MASK(INT_IPI_3) | \
+    INT_MASK(INT_IPI_2) | \
+    INT_MASK(INT_IPI_1) | \
+    INT_MASK(INT_IPI_0) | \
+    INT_MASK(INT_PERF_COUNT) | \
+    INT_MASK(INT_AUX_PERF_COUNT) | \
+    INT_MASK(INT_INTCTRL_3) | \
+    INT_MASK(INT_INTCTRL_2) | \
+    INT_MASK(INT_INTCTRL_1) | \
+    INT_MASK(INT_INTCTRL_0) | \
+    0)
+#define UNMASKABLE_INTERRUPTS ( \
+    INT_MASK(INT_ITLB_MISS) | \
+    INT_MASK(INT_ILL) | \
+    INT_MASK(INT_GPV) | \
+    INT_MASK(INT_IDN_ACCESS) | \
+    INT_MASK(INT_UDN_ACCESS) | \
+    INT_MASK(INT_SWINT_3) | \
+    INT_MASK(INT_SWINT_2) | \
+    INT_MASK(INT_SWINT_1) | \
+    INT_MASK(INT_SWINT_0) | \
+    INT_MASK(INT_ILL_TRANS) | \
+    INT_MASK(INT_UNALIGN_DATA) | \
+    INT_MASK(INT_DTLB_MISS) | \
+    INT_MASK(INT_DTLB_ACCESS) | \
+    INT_MASK(INT_BOOT_ACCESS) | \
+    INT_MASK(INT_WORLD_ACCESS) | \
+    INT_MASK(INT_I_ASID) | \
+    INT_MASK(INT_D_ASID) | \
+    INT_MASK(INT_DOUBLE_FAULT) | \
+    0)
+#define SYNC_INTERRUPTS ( \
+    INT_MASK(INT_SINGLE_STEP_3) | \
+    INT_MASK(INT_SINGLE_STEP_2) | \
+    INT_MASK(INT_SINGLE_STEP_1) | \
+    INT_MASK(INT_SINGLE_STEP_0) | \
+    INT_MASK(INT_IDN_COMPLETE) | \
+    INT_MASK(INT_UDN_COMPLETE) | \
+    INT_MASK(INT_ITLB_MISS) | \
+    INT_MASK(INT_ILL) | \
+    INT_MASK(INT_GPV) | \
+    INT_MASK(INT_IDN_ACCESS) | \
+    INT_MASK(INT_UDN_ACCESS) | \
+    INT_MASK(INT_SWINT_3) | \
+    INT_MASK(INT_SWINT_2) | \
+    INT_MASK(INT_SWINT_1) | \
+    INT_MASK(INT_SWINT_0) | \
+    INT_MASK(INT_ILL_TRANS) | \
+    INT_MASK(INT_UNALIGN_DATA) | \
+    INT_MASK(INT_DTLB_MISS) | \
+    INT_MASK(INT_DTLB_ACCESS) | \
+    0)
+#define NON_SYNC_INTERRUPTS ( \
+    INT_MASK(INT_MEM_ERROR) | \
+    INT_MASK(INT_IDN_FIREWALL) | \
+    INT_MASK(INT_UDN_FIREWALL) | \
+    INT_MASK(INT_TILE_TIMER) | \
+    INT_MASK(INT_AUX_TILE_TIMER) | \
+    INT_MASK(INT_IDN_TIMER) | \
+    INT_MASK(INT_UDN_TIMER) | \
+    INT_MASK(INT_IDN_AVAIL) | \
+    INT_MASK(INT_UDN_AVAIL) | \
+    INT_MASK(INT_IPI_3) | \
+    INT_MASK(INT_IPI_2) | \
+    INT_MASK(INT_IPI_1) | \
+    INT_MASK(INT_IPI_0) | \
+    INT_MASK(INT_PERF_COUNT) | \
+    INT_MASK(INT_AUX_PERF_COUNT) | \
+    INT_MASK(INT_INTCTRL_3) | \
+    INT_MASK(INT_INTCTRL_2) | \
+    INT_MASK(INT_INTCTRL_1) | \
+    INT_MASK(INT_INTCTRL_0) | \
+    INT_MASK(INT_BOOT_ACCESS) | \
+    INT_MASK(INT_WORLD_ACCESS) | \
+    INT_MASK(INT_I_ASID) | \
+    INT_MASK(INT_D_ASID) | \
+    INT_MASK(INT_DOUBLE_FAULT) | \
+    0)
+#endif /* !__ASSEMBLER__ */
+#endif /* !__ARCH_INTERRUPTS_H__ */
--- a/arch/tile/include/arch/spr_def_64.h
+++ b/arch/tile/include/arch/spr_def_64.h
@ -0,0 +1,173 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef __DOXYGEN__
+
+#ifndef __ARCH_SPR_DEF_H__
+#define __ARCH_SPR_DEF_H__
+
+#define SPR_AUX_PERF_COUNT_0 0x2105
+#define SPR_AUX_PERF_COUNT_1 0x2106
+#define SPR_AUX_PERF_COUNT_CTL 0x2107
+#define SPR_AUX_PERF_COUNT_STS 0x2108
+#define SPR_CMPEXCH_VALUE 0x2780
+#define SPR_CYCLE 0x2781
+#define SPR_DONE 0x2705
+#define SPR_DSTREAM_PF 0x2706
+#define SPR_EVENT_BEGIN 0x2782
+#define SPR_EVENT_END 0x2783
+#define SPR_EX_CONTEXT_0_0 0x2580
+#define SPR_EX_CONTEXT_0_1 0x2581
+#define SPR_EX_CONTEXT_0_1__PL_SHIFT 0
+#define SPR_EX_CONTEXT_0_1__PL_RMASK 0x3
+#define SPR_EX_CONTEXT_0_1__PL_MASK  0x3
+#define SPR_EX_CONTEXT_0_1__ICS_SHIFT 2
+#define SPR_EX_CONTEXT_0_1__ICS_RMASK 0x1
+#define SPR_EX_CONTEXT_0_1__ICS_MASK  0x4
+#define SPR_EX_CONTEXT_1_0 0x2480
+#define SPR_EX_CONTEXT_1_1 0x2481
+#define SPR_EX_CONTEXT_1_1__PL_SHIFT 0
+#define SPR_EX_CONTEXT_1_1__PL_RMASK 0x3
+#define SPR_EX_CONTEXT_1_1__PL_MASK  0x3
+#define SPR_EX_CONTEXT_1_1__ICS_SHIFT 2
+#define SPR_EX_CONTEXT_1_1__ICS_RMASK 0x1
+#define SPR_EX_CONTEXT_1_1__ICS_MASK  0x4
+#define SPR_EX_CONTEXT_2_0 0x2380
+#define SPR_EX_CONTEXT_2_1 0x2381
+#define SPR_EX_CONTEXT_2_1__PL_SHIFT 0
+#define SPR_EX_CONTEXT_2_1__PL_RMASK 0x3
+#define SPR_EX_CONTEXT_2_1__PL_MASK  0x3
+#define SPR_EX_CONTEXT_2_1__ICS_SHIFT 2
+#define SPR_EX_CONTEXT_2_1__ICS_RMASK 0x1
+#define SPR_EX_CONTEXT_2_1__ICS_MASK  0x4
+#define SPR_FAIL 0x2707
+#define SPR_ILL_TRANS_REASON__I_STREAM_VA_RMASK 0x1
+#define SPR_INTCTRL_0_STATUS 0x2505
+#define SPR_INTCTRL_1_STATUS 0x2405
+#define SPR_INTCTRL_2_STATUS 0x2305
+#define SPR_INTERRUPT_CRITICAL_SECTION 0x2708
+#define SPR_INTERRUPT_MASK_0 0x2506
+#define SPR_INTERRUPT_MASK_1 0x2406
+#define SPR_INTERRUPT_MASK_2 0x2306
+#define SPR_INTERRUPT_MASK_RESET_0 0x2507
+#define SPR_INTERRUPT_MASK_RESET_1 0x2407
+#define SPR_INTERRUPT_MASK_RESET_2 0x2307
+#define SPR_INTERRUPT_MASK_SET_0 0x2508
+#define SPR_INTERRUPT_MASK_SET_1 0x2408
+#define SPR_INTERRUPT_MASK_SET_2 0x2308
+#define SPR_INTERRUPT_VECTOR_BASE_0 0x2509
+#define SPR_INTERRUPT_VECTOR_BASE_1 0x2409
+#define SPR_INTERRUPT_VECTOR_BASE_2 0x2309
+#define SPR_INTERRUPT_VECTOR_BASE_3 0x2209
+#define SPR_IPI_EVENT_0 0x1f05
+#define SPR_IPI_EVENT_1 0x1e05
+#define SPR_IPI_EVENT_2 0x1d05
+#define SPR_IPI_EVENT_RESET_0 0x1f06
+#define SPR_IPI_EVENT_RESET_1 0x1e06
+#define SPR_IPI_EVENT_RESET_2 0x1d06
+#define SPR_IPI_EVENT_SET_0 0x1f07
+#define SPR_IPI_EVENT_SET_1 0x1e07
+#define SPR_IPI_EVENT_SET_2 0x1d07
+#define SPR_IPI_MASK_0 0x1f08
+#define SPR_IPI_MASK_1 0x1e08
+#define SPR_IPI_MASK_2 0x1d08
+#define SPR_IPI_MASK_RESET_0 0x1f09
+#define SPR_IPI_MASK_RESET_1 0x1e09
+#define SPR_IPI_MASK_RESET_2 0x1d09
+#define SPR_IPI_MASK_SET_0 0x1f0a
+#define SPR_IPI_MASK_SET_1 0x1e0a
+#define SPR_IPI_MASK_SET_2 0x1d0a
+#define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
+#define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
+#define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
+#define SPR_MPL_INTCTRL_0_SET_0 0x2500
+#define SPR_MPL_INTCTRL_0_SET_1 0x2501
+#define SPR_MPL_INTCTRL_0_SET_2 0x2502
+#define SPR_MPL_INTCTRL_1_SET_0 0x2400
+#define SPR_MPL_INTCTRL_1_SET_1 0x2401
+#define SPR_MPL_INTCTRL_1_SET_2 0x2402
+#define SPR_MPL_INTCTRL_2_SET_0 0x2300
+#define SPR_MPL_INTCTRL_2_SET_1 0x2301
+#define SPR_MPL_INTCTRL_2_SET_2 0x2302
+#define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
+#define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
+#define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
+#define SPR_MPL_UDN_AVAIL_SET_0 0x1b00
+#define SPR_MPL_UDN_AVAIL_SET_1 0x1b01
+#define SPR_MPL_UDN_AVAIL_SET_2 0x1b02
+#define SPR_MPL_UDN_COMPLETE_SET_0 0x0600
+#define SPR_MPL_UDN_COMPLETE_SET_1 0x0601
+#define SPR_MPL_UDN_COMPLETE_SET_2 0x0602
+#define SPR_MPL_UDN_FIREWALL_SET_0 0x1500
+#define SPR_MPL_UDN_FIREWALL_SET_1 0x1501
+#define SPR_MPL_UDN_FIREWALL_SET_2 0x1502
+#define SPR_MPL_UDN_TIMER_SET_0 0x1900
+#define SPR_MPL_UDN_TIMER_SET_1 0x1901
+#define SPR_MPL_UDN_TIMER_SET_2 0x1902
+#define SPR_MPL_WORLD_ACCESS_SET_0 0x2700
+#define SPR_MPL_WORLD_ACCESS_SET_1 0x2701
+#define SPR_MPL_WORLD_ACCESS_SET_2 0x2702
+#define SPR_PASS 0x2709
+#define SPR_PERF_COUNT_0 0x2005
+#define SPR_PERF_COUNT_1 0x2006
+#define SPR_PERF_COUNT_CTL 0x2007
+#define SPR_PERF_COUNT_DN_CTL 0x2008
+#define SPR_PERF_COUNT_STS 0x2009
+#define SPR_PROC_STATUS 0x2784
+#define SPR_SIM_CONTROL 0x2785
+#define SPR_SINGLE_STEP_CONTROL_0 0x0405
+#define SPR_SINGLE_STEP_CONTROL_0__CANCELED_MASK  0x1
+#define SPR_SINGLE_STEP_CONTROL_0__INHIBIT_MASK  0x2
+#define SPR_SINGLE_STEP_CONTROL_1 0x0305
+#define SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK  0x1
+#define SPR_SINGLE_STEP_CONTROL_1__INHIBIT_MASK  0x2
+#define SPR_SINGLE_STEP_CONTROL_2 0x0205
+#define SPR_SINGLE_STEP_CONTROL_2__CANCELED_MASK  0x1
+#define SPR_SINGLE_STEP_CONTROL_2__INHIBIT_MASK  0x2
+#define SPR_SINGLE_STEP_EN_0_0 0x250a
+#define SPR_SINGLE_STEP_EN_0_1 0x240a
+#define SPR_SINGLE_STEP_EN_0_2 0x230a
+#define SPR_SINGLE_STEP_EN_1_0 0x250b
+#define SPR_SINGLE_STEP_EN_1_1 0x240b
+#define SPR_SINGLE_STEP_EN_1_2 0x230b
+#define SPR_SINGLE_STEP_EN_2_0 0x250c
+#define SPR_SINGLE_STEP_EN_2_1 0x240c
+#define SPR_SINGLE_STEP_EN_2_2 0x230c
+#define SPR_SYSTEM_SAVE_0_0 0x2582
+#define SPR_SYSTEM_SAVE_0_1 0x2583
+#define SPR_SYSTEM_SAVE_0_2 0x2584
+#define SPR_SYSTEM_SAVE_0_3 0x2585
+#define SPR_SYSTEM_SAVE_1_0 0x2482
+#define SPR_SYSTEM_SAVE_1_1 0x2483
+#define SPR_SYSTEM_SAVE_1_2 0x2484
+#define SPR_SYSTEM_SAVE_1_3 0x2485
+#define SPR_SYSTEM_SAVE_2_0 0x2382
+#define SPR_SYSTEM_SAVE_2_1 0x2383
+#define SPR_SYSTEM_SAVE_2_2 0x2384
+#define SPR_SYSTEM_SAVE_2_3 0x2385
+#define SPR_TILE_COORD 0x270b
+#define SPR_TILE_RTF_HWM 0x270c
+#define SPR_TILE_TIMER_CONTROL 0x1605
+#define SPR_UDN_AVAIL_EN 0x1b05
+#define SPR_UDN_DATA_AVAIL 0x0b80
+#define SPR_UDN_DEADLOCK_TIMEOUT 0x1906
+#define SPR_UDN_DEMUX_COUNT_0 0x0b05
+#define SPR_UDN_DEMUX_COUNT_1 0x0b06
+#define SPR_UDN_DEMUX_COUNT_2 0x0b07
+#define SPR_UDN_DEMUX_COUNT_3 0x0b08
+#define SPR_UDN_DIRECTION_PROTECT 0x1505
+
+#endif /* !defined(__ARCH_SPR_DEF_H__) */
+
+#endif /* !defined(__DOXYGEN__) */
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@ -0,0 +1,169 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Do not include directly; use <asm/atomic.h>.
+ */
+
+#ifndef _ASM_TILE_ATOMIC_64_H
+#define _ASM_TILE_ATOMIC_64_H
+
+#ifndef __ASSEMBLY__
+
+#include <arch/spr_def.h>
+
+/* First, the 32-bit atomic ops that are "real" on our 64-bit platform. */
+
+#define atomic_set(v, i) ((v)->counter = (i))
+
+/*
+ * The smp_mb() operations throughout are to support the fact that
+ * Linux requires memory barriers before and after the operation,
+ * on any routine which updates memory and returns a value.
+ */
+
+static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
+{
+	int val;
+	__insn_mtspr(SPR_CMPEXCH_VALUE, o);
+	smp_mb();  /* barrier for proper semantics */
+	val = __insn_cmpexch4((void *)&v->counter, n);
+	smp_mb();  /* barrier for proper semantics */
+	return val;
+}
+
+static inline int atomic_xchg(atomic_t *v, int n)
+{
+	int val;
+	smp_mb();  /* barrier for proper semantics */
+	val = __insn_exch4((void *)&v->counter, n);
+	smp_mb();  /* barrier for proper semantics */
+	return val;
+}
+
+static inline void atomic_add(int i, atomic_t *v)
+{
+	__insn_fetchadd4((void *)&v->counter, i);
+}
+
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+	int val;
+	smp_mb();  /* barrier for proper semantics */
+	val = __insn_fetchadd4((void *)&v->counter, i) + i;
+	barrier();  /* the "+ i" above will wait on memory */
+	return val;
+}
+
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int guess, oldval = v->counter;
+	do {
+		if (oldval == u)
+			break;
+		guess = oldval;
+		oldval = atomic_cmpxchg(v, guess, guess + a);
+	} while (guess != oldval);
+	return oldval != u;
+}
+
+/* Now the true 64-bit operations. */
+
+#define ATOMIC64_INIT(i)	{ (i) }
+
+#define atomic64_read(v)		((v)->counter)
+#define atomic64_set(v, i) ((v)->counter = (i))
+
+static inline long atomic64_cmpxchg(atomic64_t *v, long o, long n)
+{
+	long val;
+	smp_mb();  /* barrier for proper semantics */
+	__insn_mtspr(SPR_CMPEXCH_VALUE, o);
+	val = __insn_cmpexch((void *)&v->counter, n);
+	smp_mb();  /* barrier for proper semantics */
+	return val;
+}
+
+static inline long atomic64_xchg(atomic64_t *v, long n)
+{
+	long val;
+	smp_mb();  /* barrier for proper semantics */
+	val = __insn_exch((void *)&v->counter, n);
+	smp_mb();  /* barrier for proper semantics */
+	return val;
+}
+
+static inline void atomic64_add(long i, atomic64_t *v)
+{
+	__insn_fetchadd((void *)&v->counter, i);
+}
+
+static inline long atomic64_add_return(long i, atomic64_t *v)
+{
+	int val;
+	smp_mb();  /* barrier for proper semantics */
+	val = __insn_fetchadd((void *)&v->counter, i) + i;
+	barrier();  /* the "+ i" above will wait on memory */
+	return val;
+}
+
+static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
+{
+	long guess, oldval = v->counter;
+	do {
+		if (oldval == u)
+			break;
+		guess = oldval;
+		oldval = atomic64_cmpxchg(v, guess, guess + a);
+	} while (guess != oldval);
+	return oldval != u;
+}
+
+#define atomic64_sub_return(i, v)	atomic64_add_return(-(i), (v))
+#define atomic64_sub(i, v)		atomic64_add(-(i), (v))
+#define atomic64_inc_return(v)		atomic64_add_return(1, (v))
+#define atomic64_dec_return(v)		atomic64_sub_return(1, (v))
+#define atomic64_inc(v)			atomic64_add(1, (v))
+#define atomic64_dec(v)			atomic64_sub(1, (v))
+
+#define atomic64_inc_and_test(v)	(atomic64_inc_return(v) == 0)
+#define atomic64_dec_and_test(v)	(atomic64_dec_return(v) == 0)
+#define atomic64_sub_and_test(i, v)	(atomic64_sub_return((i), (v)) == 0)
+#define atomic64_add_negative(i, v)	(atomic64_add_return((i), (v)) < 0)
+
+#define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1, 0)
+
+/* Atomic dec and inc don't implement barrier, so provide them if needed. */
+#define smp_mb__before_atomic_dec()	smp_mb()
+#define smp_mb__after_atomic_dec()	smp_mb()
+#define smp_mb__before_atomic_inc()	smp_mb()
+#define smp_mb__after_atomic_inc()	smp_mb()
+
+#define xchg(ptr, x)							\
+	((typeof(*(ptr)))						\
+	 ((sizeof(*(ptr)) == sizeof(atomic_t)) ?			\
+	  atomic_xchg((atomic_t *)(ptr), (long)(x)) :			\
+	  (sizeof(*(ptr)) == sizeof(atomic_long_t)) ?			\
+	  atomic_long_xchg((atomic_long_t *)(ptr), (long)(x)) :		\
+	  __xchg_called_with_bad_pointer()))
+
+#define cmpxchg(ptr, o, n)						\
+	((typeof(*(ptr)))						\
+	 ((sizeof(*(ptr)) == sizeof(atomic_t)) ?			\
+	  atomic_cmpxchg((atomic_t *)(ptr), (long)(o), (long)(n)) :	\
+	  (sizeof(*(ptr)) == sizeof(atomic_long_t)) ?			\
+	  atomic_long_cmpxchg((atomic_long_t *)(ptr), (long)(o), (long)(n)) : \
+	  __cmpxchg_called_with_bad_pointer()))
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_TILE_ATOMIC_64_H */
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@ -122,6 +122,7 @@ static inline unsigned long __arch_hweight64(__u64 w)
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/sched.h>
+#include <asm-generic/bitops/non-atomic.h>
 #include <asm-generic/bitops/le.h>

 #endif /* _ASM_TILE_BITOPS_H */
--- a/arch/tile/include/asm/bitops_32.h
+++ b/arch/tile/include/asm/bitops_32.h
@ -126,7 +126,6 @@ static inline int test_and_change_bit(unsigned nr,
 #define smp_mb__before_clear_bit()	smp_mb()
 #define smp_mb__after_clear_bit()	do {} while (0)

-#include <asm-generic/bitops/non-atomic.h>
 #include <asm-generic/bitops/ext2-atomic.h>

 #endif /* _ASM_TILE_BITOPS_32_H */
--- a/arch/tile/include/asm/bitops_64.h
+++ b/arch/tile/include/asm/bitops_64.h
@ -0,0 +1,105 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_BITOPS_64_H
+#define _ASM_TILE_BITOPS_64_H
+
+#include <linux/compiler.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+
+/* See <asm/bitops.h> for API comments. */
+
+static inline void set_bit(unsigned nr, volatile unsigned long *addr)
+{
+	unsigned long mask = (1UL << (nr % BITS_PER_LONG));
+	__insn_fetchor((void *)(addr + nr / BITS_PER_LONG), mask);
+}
+
+static inline void clear_bit(unsigned nr, volatile unsigned long *addr)
+{
+	unsigned long mask = (1UL << (nr % BITS_PER_LONG));
+	__insn_fetchand((void *)(addr + nr / BITS_PER_LONG), ~mask);
+}
+
+#define smp_mb__before_clear_bit()	smp_mb()
+#define smp_mb__after_clear_bit()	smp_mb()
+
+
+static inline void change_bit(unsigned nr, volatile unsigned long *addr)
+{
+	unsigned long old, mask = (1UL << (nr % BITS_PER_LONG));
+	long guess, oldval;
+	addr += nr / BITS_PER_LONG;
+	old = *addr;
+	do {
+		guess = oldval;
+		oldval = atomic64_cmpxchg((atomic64_t *)addr,
+					  guess, guess ^ mask);
+	} while (guess != oldval);
+}
+
+
+/*
+ * The test_and_xxx_bit() routines require a memory fence before we
+ * start the operation, and after the operation completes.  We use
+ * smp_mb() before, and rely on the "!= 0" comparison, plus a compiler
+ * barrier(), to block until the atomic op is complete.
+ */
+
+static inline int test_and_set_bit(unsigned nr, volatile unsigned long *addr)
+{
+	int val;
+	unsigned long mask = (1UL << (nr % BITS_PER_LONG));
+	smp_mb();  /* barrier for proper semantics */
+	val = (__insn_fetchor((void *)(addr + nr / BITS_PER_LONG), mask)
+	       & mask) != 0;
+	barrier();
+	return val;
+}
+
+
+static inline int test_and_clear_bit(unsigned nr, volatile unsigned long *addr)
+{
+	int val;
+	unsigned long mask = (1UL << (nr % BITS_PER_LONG));
+	smp_mb();  /* barrier for proper semantics */
+	val = (__insn_fetchand((void *)(addr + nr / BITS_PER_LONG), ~mask)
+	       & mask) != 0;
+	barrier();
+	return val;
+}
+
+
+static inline int test_and_change_bit(unsigned nr,
+				      volatile unsigned long *addr)
+{
+	unsigned long mask = (1UL << (nr % BITS_PER_LONG));
+	long guess, oldval = *addr;
+	addr += nr / BITS_PER_LONG;
+	oldval = *addr;
+	do {
+		guess = oldval;
+		oldval = atomic64_cmpxchg((atomic64_t *)addr,
+					  guess, guess ^ mask);
+	} while (guess != oldval);
+	return (oldval & mask) != 0;
+}
+
+#define ext2_set_bit_atomic(lock, nr, addr)			\
+	test_and_set_bit((nr), (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr)			\
+	test_and_clear_bit((nr), (unsigned long *)(addr))
+
+#endif /* _ASM_TILE_BITOPS_64_H */
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@ -215,8 +215,8 @@ struct compat_sigaction;
 struct compat_siginfo;
 struct compat_sigaltstack;
 long compat_sys_execve(const char __user *path,
-		       const compat_uptr_t __user *argv,
-		       const compat_uptr_t __user *envp, struct pt_regs *);
+		       compat_uptr_t __user *argv,
+		       compat_uptr_t __user *envp, struct pt_regs *);
 long compat_sys_rt_sigaction(int sig, struct compat_sigaction __user *act,
 			     struct compat_sigaction __user *oact,
 			     size_t sigsetsize);
--- a/arch/tile/include/asm/opcode-tile_64.h
+++ b/arch/tile/include/asm/opcode-tile_64.h
--- a/arch/tile/include/asm/opcode_constants_64.h
+++ b/arch/tile/include/asm/opcode_constants_64.h
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@ -0,0 +1,175 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ */
+
+#ifndef _ASM_TILE_PGTABLE_64_H
+#define _ASM_TILE_PGTABLE_64_H
+
+/* The level-0 page table breaks the address space into 32-bit chunks. */
+#define PGDIR_SHIFT	HV_LOG2_L1_SPAN
+#define PGDIR_SIZE	HV_L1_SPAN
+#define PGDIR_MASK	(~(PGDIR_SIZE-1))
+#define PTRS_PER_PGD	HV_L0_ENTRIES
+#define SIZEOF_PGD	(PTRS_PER_PGD * sizeof(pgd_t))
+
+/*
+ * The level-1 index is defined by the huge page size.  A PMD is composed
+ * of PTRS_PER_PMD pgd_t's and is the middle level of the page table.
+ */
+#define PMD_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
+#define PMD_SIZE	HV_PAGE_SIZE_LARGE
+#define PMD_MASK	(~(PMD_SIZE-1))
+#define PTRS_PER_PMD	(1 << (PGDIR_SHIFT - PMD_SHIFT))
+#define SIZEOF_PMD	(PTRS_PER_PMD * sizeof(pmd_t))
+
+/*
+ * The level-2 index is defined by the difference between the huge
+ * page size and the normal page size.  A PTE is composed of
+ * PTRS_PER_PTE pte_t's and is the bottom level of the page table.
+ * Note that the hypervisor docs use PTE for what we call pte_t, so
+ * this nomenclature is somewhat confusing.
+ */
+#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL))
+#define SIZEOF_PTE	(PTRS_PER_PTE * sizeof(pte_t))
+
+/*
+ * Align the vmalloc area to an L2 page table, and leave a guard page
+ * at the beginning and end.  The vmalloc code also puts in an internal
+ * guard page between each allocation.
+ */
+#define _VMALLOC_END	HUGE_VMAP_BASE
+#define VMALLOC_END	(_VMALLOC_END - PAGE_SIZE)
+#define VMALLOC_START	(_VMALLOC_START + PAGE_SIZE)
+
+#define HUGE_VMAP_END	(HUGE_VMAP_BASE + PGDIR_SIZE)
+
+#ifndef __ASSEMBLY__
+
+/* We have no pud since we are a three-level page table. */
+#include <asm-generic/pgtable-nopud.h>
+
+static inline int pud_none(pud_t pud)
+{
+	return pud_val(pud) == 0;
+}
+
+static inline int pud_present(pud_t pud)
+{
+	return pud_val(pud) & _PAGE_PRESENT;
+}
+
+#define pmd_ERROR(e) \
+	pr_err("%s:%d: bad pmd 0x%016llx.\n", __FILE__, __LINE__, pmd_val(e))
+
+static inline void pud_clear(pud_t *pudp)
+{
+	__pte_clear(&pudp->pgd);
+}
+
+static inline int pud_bad(pud_t pud)
+{
+	return ((pud_val(pud) & _PAGE_ALL) != _PAGE_TABLE);
+}
+
+/* Return the page-table frame number (ptfn) that a pud_t points at. */
+#define pud_ptfn(pud) hv_pte_get_ptfn((pud).pgd)
+
+/*
+ * A given kernel pud_t maps to a kernel pmd_t table at a specific
+ * virtual address.  Since kernel pmd_t tables can be aligned at
+ * sub-page granularity, this macro can return non-page-aligned
+ * pointers, despite its name.
+ */
+#define pud_page_vaddr(pud) \
+	(__va((phys_addr_t)pud_ptfn(pud) << HV_LOG2_PAGE_TABLE_ALIGN))
+
+/*
+ * A pud_t points to a pmd_t array.  Since we can have multiple per
+ * page, we don't have a one-to-one mapping of pud_t's to pages.
+ */
+#define pud_page(pud) pfn_to_page(HV_PTFN_TO_PFN(pud_ptfn(pud)))
+
+static inline unsigned long pud_index(unsigned long address)
+{
+	return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
+#define pmd_offset(pud, address) \
+	((pmd_t *)pud_page_vaddr(*(pud)) + pmd_index(address))
+
+static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	set_pte(pmdp, pmdval);
+}
+
+/* Create a pmd from a PTFN and pgprot. */
+static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
+{
+	return hv_pte_set_ptfn(prot, ptfn);
+}
+
+/* Return the page-table frame number (ptfn) that a pmd_t points at. */
+static inline unsigned long pmd_ptfn(pmd_t pmd)
+{
+	return hv_pte_get_ptfn(pmd);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	__pte_clear(pmdp);
+}
+
+/* Normalize an address to having the correct high bits set. */
+#define pgd_addr_normalize pgd_addr_normalize
+static inline unsigned long pgd_addr_normalize(unsigned long addr)
+{
+	return ((long)addr << (CHIP_WORD_SIZE() - CHIP_VA_WIDTH())) >>
+		(CHIP_WORD_SIZE() - CHIP_VA_WIDTH());
+}
+
+/* We don't define any pgds for these addresses. */
+static inline int pgd_addr_invalid(unsigned long addr)
+{
+	return addr >= MEM_HV_START ||
+		(addr > MEM_LOW_END && addr < MEM_HIGH_START);
+}
+
+/*
+ * Use atomic instructions to provide atomicity against the hypervisor.
+ */
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+					    unsigned long addr, pte_t *ptep)
+{
+	return (__insn_fetchand(&ptep->val, ~HV_PTE_ACCESSED) >>
+		HV_PTE_INDEX_ACCESSED) & 0x1;
+}
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+				      unsigned long addr, pte_t *ptep)
+{
+	__insn_fetchand(&ptep->val, ~HV_PTE_WRITABLE);
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+				       unsigned long addr, pte_t *ptep)
+{
+	return hv_pte(__insn_exch(&ptep->val, 0UL));
+}
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_TILE_PGTABLE_64_H */
--- a/arch/tile/include/asm/spinlock_64.h
+++ b/arch/tile/include/asm/spinlock_64.h
@ -0,0 +1,161 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * 64-bit SMP ticket spinlocks, allowing only a single CPU anywhere
+ * (the type definitions are in asm/spinlock_types.h)
+ */
+
+#ifndef _ASM_TILE_SPINLOCK_64_H
+#define _ASM_TILE_SPINLOCK_64_H
+
+/* Shifts and masks for the various fields in "lock". */
+#define __ARCH_SPIN_CURRENT_SHIFT	17
+#define __ARCH_SPIN_NEXT_MASK		0x7fff
+#define __ARCH_SPIN_NEXT_OVERFLOW	0x8000
+
+/*
+ * Return the "current" portion of a ticket lock value,
+ * i.e. the number that currently owns the lock.
+ */
+static inline int arch_spin_current(u32 val)
+{
+	return val >> __ARCH_SPIN_CURRENT_SHIFT;
+}
+
+/*
+ * Return the "next" portion of a ticket lock value,
+ * i.e. the number that the next task to try to acquire the lock will get.
+ */
+static inline int arch_spin_next(u32 val)
+{
+	return val & __ARCH_SPIN_NEXT_MASK;
+}
+
+/* The lock is locked if a task would have to wait to get it. */
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+	u32 val = lock->lock;
+	return arch_spin_current(val) != arch_spin_next(val);
+}
+
+/* Bump the current ticket so the next task owns the lock. */
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+	wmb();  /* guarantee anything modified under the lock is visible */
+	__insn_fetchadd4(&lock->lock, 1U << __ARCH_SPIN_CURRENT_SHIFT);
+}
+
+void arch_spin_unlock_wait(arch_spinlock_t *lock);
+
+void arch_spin_lock_slow(arch_spinlock_t *lock, u32 val);
+
+/* Grab the "next" ticket number and bump it atomically.
+ * If the current ticket is not ours, go to the slow path.
+ * We also take the slow path if the "next" value overflows.
+ */
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+	u32 val = __insn_fetchadd4(&lock->lock, 1);
+	u32 ticket = val & (__ARCH_SPIN_NEXT_MASK | __ARCH_SPIN_NEXT_OVERFLOW);
+	if (unlikely(arch_spin_current(val) != ticket))
+		arch_spin_lock_slow(lock, ticket);
+}
+
+/* Try to get the lock, and return whether we succeeded. */
+int arch_spin_trylock(arch_spinlock_t *lock);
+
+/* We cannot take an interrupt after getting a ticket, so don't enable them. */
+#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * We use fetchadd() for readers, and fetchor() with the sign bit
+ * for writers.
+ */
+
+#define __WRITE_LOCK_BIT (1 << 31)
+
+static inline int arch_write_val_locked(int val)
+{
+	return val < 0;  /* Optimize "val & __WRITE_LOCK_BIT". */
+}
+
+/**
+ * read_can_lock - would read_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int arch_read_can_lock(arch_rwlock_t *rw)
+{
+	return !arch_write_val_locked(rw->lock);
+}
+
+/**
+ * write_can_lock - would write_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int arch_write_can_lock(arch_rwlock_t *rw)
+{
+	return rw->lock == 0;
+}
+
+extern void __read_lock_failed(arch_rwlock_t *rw);
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+	u32 val = __insn_fetchaddgez4(&rw->lock, 1);
+	if (unlikely(arch_write_val_locked(val)))
+		__read_lock_failed(rw);
+}
+
+extern void __write_lock_failed(arch_rwlock_t *rw, u32 val);
+
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+	u32 val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT);
+	if (unlikely(val != 0))
+		__write_lock_failed(rw, val);
+}
+
+static inline void arch_read_unlock(arch_rwlock_t *rw)
+{
+	__insn_mf();
+	__insn_fetchadd4(&rw->lock, -1);
+}
+
+static inline void arch_write_unlock(arch_rwlock_t *rw)
+{
+	__insn_mf();
+	rw->lock = 0;
+}
+
+static inline int arch_read_trylock(arch_rwlock_t *rw)
+{
+	return !arch_write_val_locked(__insn_fetchaddgez4(&rw->lock, 1));
+}
+
+static inline int arch_write_trylock(arch_rwlock_t *rw)
+{
+	u32 val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT);
+	if (likely(val == 0))
+		return 1;
+	if (!arch_write_val_locked(val))
+		__insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT);
+	return 0;
+}
+
+#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
+#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
+
+#endif /* _ASM_TILE_SPINLOCK_64_H */
--- a/arch/tile/kernel/futex_64.S
+++ b/arch/tile/kernel/futex_64.S
@ -0,0 +1,55 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Atomically access user memory, but use MMU to avoid propagating
+ * kernel exceptions.
+ */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/futex.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+
+/*
+ * Provide a set of atomic memory operations supporting <asm/futex.h>.
+ *
+ * r0: user address to manipulate
+ * r1: new value to write, or for cmpxchg, old value to compare against
+ * r2: (cmpxchg only) new value to write
+ *
+ * Return __get_user struct, r0 with value, r1 with error.
+ */
+#define FUTEX_OP(name, ...) \
+STD_ENTRY(futex_##name)			\
+	__VA_ARGS__;			\
+	{				\
+	 move   r1, zero;		\
+	 jrp    lr			\
+	};				\
+	STD_ENDPROC(futex_##name);	\
+	.pushsection __ex_table,"a";	\
+	.quad 1b, get_user_fault;	\
+	.popsection
+
+	.pushsection .fixup,"ax"
+get_user_fault:
+	{ movei r1, -EFAULT; jrp lr }
+	ENDPROC(get_user_fault)
+	.popsection
+
+FUTEX_OP(cmpxchg, mtspr CMPEXCH_VALUE, r1; 1: cmpexch4 r0, r0, r2)
+FUTEX_OP(set, 1: exch4 r0, r0, r1)
+FUTEX_OP(add, 1: fetchadd4 r0, r0, r1)
+FUTEX_OP(or, 1: fetchor4 r0, r0, r1)
+FUTEX_OP(andn, nor r1, r1, zero; 1: fetchand4 r0, r0, r1)
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@ -0,0 +1,269 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE startup code.
+ */
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/thread_info.h>
+#include <asm/processor.h>
+#include <asm/asm-offsets.h>
+#include <hv/hypervisor.h>
+#include <arch/chip.h>
+#include <arch/spr_def.h>
+
+/*
+ * This module contains the entry code for kernel images. It performs the
+ * minimal setup needed to call the generic C routines.
+ */
+
+	__HEAD
+ENTRY(_start)
+	/* Notify the hypervisor of what version of the API we want */
+	{
+	  movei r1, TILE_CHIP
+	  movei r2, TILE_CHIP_REV
+	}
+	{
+	  moveli r0, _HV_VERSION
+	  jal hv_init
+	}
+	/* Get a reasonable default ASID in r0 */
+	{
+	  move r0, zero
+	  jal hv_inquire_asid
+	}
+
+	/*
+	 * Install the default page table.  The relocation required to
+	 * statically define the table is a bit too complex, so we have
+	 * to plug in the pointer from the L0 to the L1 table by hand.
+	 * We only do this on the first cpu to boot, though, since the
+	 * other CPUs should see a properly-constructed page table.
+	 */
+	{
+	  v4int_l r2, zero, r0    /* ASID for hv_install_context */
+	  moveli r4, hw1_last(swapper_pgprot - PAGE_OFFSET)
+	}
+	{
+	  shl16insli r4, r4, hw0(swapper_pgprot - PAGE_OFFSET)
+	}
+	{
+	  ld r1, r4               /* access_pte for hv_install_context */
+	}
+	{
+	  moveli r0, hw1_last(.Lsv_data_pmd - PAGE_OFFSET)
+	  moveli r6, hw1_last(temp_data_pmd - PAGE_OFFSET)
+	}
+	{
+	  /* After initializing swapper_pgprot, HV_PTE_GLOBAL is set. */
+	  bfextu r7, r1, HV_PTE_INDEX_GLOBAL, HV_PTE_INDEX_GLOBAL
+	  inv r4
+	}
+	bnez r7, .Lno_write
+	{
+	  shl16insli r0, r0, hw0(.Lsv_data_pmd - PAGE_OFFSET)
+	  shl16insli r6, r6, hw0(temp_data_pmd - PAGE_OFFSET)
+	}
+	{
+	  /* Cut off the low bits of the PT address. */
+	  shrui r6, r6, HV_LOG2_PAGE_TABLE_ALIGN
+	  /* Start with our access pte. */
+	  move r5, r1
+	}
+	{
+	  /* Stuff the address into the page table pointer slot of the PTE. */
+	  bfins r5, r6, HV_PTE_INDEX_PTFN, \
+			HV_PTE_INDEX_PTFN + HV_PTE_PTFN_BITS - 1
+	}
+	{
+	  /* Store the L0 data PTE. */
+	  st r0, r5
+	  addli r6, r6, (temp_code_pmd - temp_data_pmd) >> \
+			HV_LOG2_PAGE_TABLE_ALIGN
+	}
+	{
+	  addli r0, r0, .Lsv_code_pmd - .Lsv_data_pmd
+	  bfins r5, r6, HV_PTE_INDEX_PTFN, \
+			HV_PTE_INDEX_PTFN + HV_PTE_PTFN_BITS - 1
+	}
+	/* Store the L0 code PTE. */
+	st r0, r5
+
+.Lno_write:
+	moveli lr, hw2_last(1f)
+	{
+	  shl16insli lr, lr, hw1(1f)
+	  moveli r0, hw1_last(swapper_pg_dir - PAGE_OFFSET)
+	}
+	{
+	  shl16insli lr, lr, hw0(1f)
+	  shl16insli r0, r0, hw0(swapper_pg_dir - PAGE_OFFSET)
+	}
+	{
+	  move r3, zero
+	  j hv_install_context
+	}
+1:
+
+	/* Install the interrupt base. */
+	moveli r0, hw2_last(MEM_SV_START)
+	shl16insli r0, r0, hw1(MEM_SV_START)
+	shl16insli r0, r0, hw0(MEM_SV_START)
+	mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
+
+	/*
+	 * Get our processor number and save it away in SAVE_K_0.
+	 * Extract stuff from the topology structure: r4 = y, r6 = x,
+	 * r5 = width.  FIXME: consider whether we want to just make these
+	 * 64-bit values (and if so fix smp_topology write below, too).
+	 */
+	jal hv_inquire_topology
+	{
+	  v4int_l r5, zero, r1    /* r5 = width */
+	  shrui r4, r0, 32        /* r4 = y */
+	}
+	{
+	  v4int_l r6, zero, r0    /* r6 = x */
+	  mul_lu_lu r4, r4, r5
+	}
+	{
+	  add r4, r4, r6          /* r4 == cpu == y*width + x */
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * Load up our per-cpu offset.  When the first (master) tile
+	 * boots, this value is still zero, so we will load boot_pc
+	 * with start_kernel, and boot_sp with init_stack + THREAD_SIZE.
+	 * The master tile initializes the per-cpu offset array, so that
+	 * when subsequent (secondary) tiles boot, they will instead load
+	 * from their per-cpu versions of boot_sp and boot_pc.
+	 */
+	moveli r5, hw2_last(__per_cpu_offset)
+	shl16insli r5, r5, hw1(__per_cpu_offset)
+	shl16insli r5, r5, hw0(__per_cpu_offset)
+	shl3add r5, r4, r5
+	ld r5, r5
+	bnez r5, 1f
+
+	/*
+	 * Save the width and height to the smp_topology variable
+	 * for later use.
+	 */
+	moveli r0, hw2_last(smp_topology + HV_TOPOLOGY_WIDTH_OFFSET)
+	shl16insli r0, r0, hw1(smp_topology + HV_TOPOLOGY_WIDTH_OFFSET)
+	shl16insli r0, r0, hw0(smp_topology + HV_TOPOLOGY_WIDTH_OFFSET)
+	st r0, r1
+1:
+#else
+	move r5, zero
+#endif
+
+	/* Load and go with the correct pc and sp. */
+	{
+	  moveli r1, hw2_last(boot_sp)
+	  moveli r0, hw2_last(boot_pc)
+	}
+	{
+	  shl16insli r1, r1, hw1(boot_sp)
+	  shl16insli r0, r0, hw1(boot_pc)
+	}
+	{
+	  shl16insli r1, r1, hw0(boot_sp)
+	  shl16insli r0, r0, hw0(boot_pc)
+	}
+	{
+	  add r1, r1, r5
+	  add r0, r0, r5
+	}
+	ld r0, r0
+	ld sp, r1
+	or r4, sp, r4
+	mtspr SPR_SYSTEM_SAVE_K_0, r4  /* save ksp0 + cpu */
+	addi sp, sp, -STACK_TOP_DELTA
+	{
+	  move lr, zero   /* stop backtraces in the called function */
+	  jr r0
+	}
+	ENDPROC(_start)
+
+__PAGE_ALIGNED_BSS
+	.align PAGE_SIZE
+ENTRY(empty_zero_page)
+	.fill PAGE_SIZE,1,0
+	END(empty_zero_page)
+
+	.macro PTE cpa, bits1
+	.quad HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED |\
+	      HV_PTE_GLOBAL | (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE) |\
+	      (\bits1) | (HV_CPA_TO_PFN(\cpa) << HV_PTE_INDEX_PFN)
+	.endm
+
+__PAGE_ALIGNED_DATA
+	.align PAGE_SIZE
+ENTRY(swapper_pg_dir)
+	.org swapper_pg_dir + HV_L0_INDEX(PAGE_OFFSET) * HV_PTE_SIZE
+.Lsv_data_pmd:
+	.quad 0  /* PTE temp_data_pmd - PAGE_OFFSET, 0 */
+	.org swapper_pg_dir + HV_L0_INDEX(MEM_SV_START) * HV_PTE_SIZE
+.Lsv_code_pmd:
+	.quad 0  /* PTE temp_code_pmd - PAGE_OFFSET, 0 */
+	.org swapper_pg_dir + HV_L0_SIZE
+	END(swapper_pg_dir)
+
+	.align HV_PAGE_TABLE_ALIGN
+ENTRY(temp_data_pmd)
+	/*
+	 * We fill the PAGE_OFFSET pmd with huge pages with
+	 * VA = PA + PAGE_OFFSET.  We remap things with more precise access
+	 * permissions later.
+	 */
+	.set addr, 0
+	.rept HV_L1_ENTRIES
+	PTE addr, HV_PTE_READABLE | HV_PTE_WRITABLE
+	.set addr, addr + HV_PAGE_SIZE_LARGE
+	.endr
+	.org temp_data_pmd + HV_L1_SIZE
+	END(temp_data_pmd)
+
+	.align HV_PAGE_TABLE_ALIGN
+ENTRY(temp_code_pmd)
+	/*
+	 * We fill the MEM_SV_START pmd with huge pages with
+	 * VA = PA + PAGE_OFFSET.  We remap things with more precise access
+	 * permissions later.
+	 */
+	.set addr, 0
+	.rept HV_L1_ENTRIES
+	PTE addr, HV_PTE_READABLE | HV_PTE_EXECUTABLE
+	.set addr, addr + HV_PAGE_SIZE_LARGE
+	.endr
+	.org temp_code_pmd + HV_L1_SIZE
+	END(temp_code_pmd)
+
+	/*
+	 * Isolate swapper_pgprot to its own cache line, since each cpu
+	 * starting up will read it using VA-is-PA and local homing.
+	 * This would otherwise likely conflict with other data on the cache
+	 * line, once we have set its permanent home in the page tables.
+	 */
+	__INITDATA
+	.align CHIP_L2_LINE_SIZE()
+ENTRY(swapper_pgprot)
+	.quad HV_PTE_PRESENT | (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE)
+	.align CHIP_L2_LINE_SIZE()
+	END(swapper_pgprot)
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@ -630,8 +630,8 @@ out:

 #ifdef CONFIG_COMPAT
 long compat_sys_execve(const char __user *path,
-		       const compat_uptr_t __user *argv,
-		       const compat_uptr_t __user *envp,
+		       compat_uptr_t __user *argv,
+		       compat_uptr_t __user *envp,
 		       struct pt_regs *regs)
 {
 	long error;
--- a/arch/tile/kernel/regs_64.S
+++ b/arch/tile/kernel/regs_64.S
@ -0,0 +1,145 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/system.h>
+#include <asm/ptrace.h>
+#include <asm/asm-offsets.h>
+#include <arch/spr_def.h>
+#include <asm/processor.h>
+
+/*
+ * See <asm/system.h>; called with prev and next task_struct pointers.
+ * "prev" is returned in r0 for _switch_to and also for ret_from_fork.
+ *
+ * We want to save pc/sp in "prev", and get the new pc/sp from "next".
+ * We also need to save all the callee-saved registers on the stack.
+ *
+ * Intel enables/disables access to the hardware cycle counter in
+ * seccomp (secure computing) environments if necessary, based on
+ * has_secure_computing().  We might want to do this at some point,
+ * though it would require virtualizing the other SPRs under WORLD_ACCESS.
+ *
+ * Since we're saving to the stack, we omit sp from this list.
+ * And for parallels with other architectures, we save lr separately,
+ * in the thread_struct itself (as the "pc" field).
+ *
+ * This code also needs to be aligned with process.c copy_thread()
+ */
+
+#if CALLEE_SAVED_REGS_COUNT != 24
+# error Mismatch between <asm/system.h> and kernel/entry.S
+#endif
+#define FRAME_SIZE ((2 + CALLEE_SAVED_REGS_COUNT) * 8)
+
+#define SAVE_REG(r) { st r12, r; addi r12, r12, 8 }
+#define LOAD_REG(r) { ld r, r12; addi r12, r12, 8 }
+#define FOR_EACH_CALLEE_SAVED_REG(f)					\
+							f(r30); f(r31); \
+	f(r32); f(r33); f(r34); f(r35);	f(r36); f(r37); f(r38); f(r39); \
+	f(r40); f(r41); f(r42); f(r43); f(r44); f(r45); f(r46); f(r47); \
+	f(r48); f(r49); f(r50); f(r51); f(r52);
+
+STD_ENTRY_SECTION(__switch_to, .sched.text)
+	{
+	  move r10, sp
+	  st sp, lr
+	}
+	{
+	  addli r11, sp, -FRAME_SIZE + 8
+	  addli sp, sp, -FRAME_SIZE
+	}
+	{
+	  st r11, r10
+	  addli r4, r1, TASK_STRUCT_THREAD_KSP_OFFSET
+	}
+	{
+	  ld r13, r4   /* Load new sp to a temp register early. */
+	  addi r12, sp, 16
+	}
+	FOR_EACH_CALLEE_SAVED_REG(SAVE_REG)
+	addli r3, r0, TASK_STRUCT_THREAD_KSP_OFFSET
+	{
+	  st r3, sp
+	  addli r3, r0, TASK_STRUCT_THREAD_PC_OFFSET
+	}
+	{
+	  st r3, lr
+	  addli r4, r1, TASK_STRUCT_THREAD_PC_OFFSET
+	}
+	{
+	  ld lr, r4
+	  addi r12, r13, 16
+	}
+	{
+	  /* Update sp and ksp0 simultaneously to avoid backtracer warnings. */
+	  move sp, r13
+	  mtspr SPR_SYSTEM_SAVE_K_0, r2
+	}
+	FOR_EACH_CALLEE_SAVED_REG(LOAD_REG)
+.L__switch_to_pc:
+	{
+	  addli sp, sp, FRAME_SIZE
+	  jrp lr   /* r0 is still valid here, so return it */
+	}
+	STD_ENDPROC(__switch_to)
+
+/* Return a suitable address for the backtracer for suspended threads */
+STD_ENTRY_SECTION(get_switch_to_pc, .sched.text)
+	lnk r0
+	{
+	  addli r0, r0, .L__switch_to_pc - .
+	  jrp lr
+	}
+	STD_ENDPROC(get_switch_to_pc)
+
+STD_ENTRY(get_pt_regs)
+	.irp reg, r0, r1, r2, r3, r4, r5, r6, r7, \
+		 r8, r9, r10, r11, r12, r13, r14, r15, \
+		 r16, r17, r18, r19, r20, r21, r22, r23, \
+		 r24, r25, r26, r27, r28, r29, r30, r31, \
+		 r32, r33, r34, r35, r36, r37, r38, r39, \
+		 r40, r41, r42, r43, r44, r45, r46, r47, \
+		 r48, r49, r50, r51, r52, tp, sp
+	{
+	 st r0, \reg
+	 addi r0, r0, 8
+	}
+	.endr
+	{
+	 st r0, lr
+	 addi r0, r0, PTREGS_OFFSET_PC - PTREGS_OFFSET_LR
+	}
+	lnk r1
+	{
+	 st r0, r1
+	 addi r0, r0, PTREGS_OFFSET_EX1 - PTREGS_OFFSET_PC
+	}
+	mfspr r1, INTERRUPT_CRITICAL_SECTION
+	shli r1, r1, SPR_EX_CONTEXT_1_1__ICS_SHIFT
+	ori r1, r1, KERNEL_PL
+	{
+	 st r0, r1
+	 addi r0, r0, PTREGS_OFFSET_FAULTNUM - PTREGS_OFFSET_EX1
+	}
+	{
+	 st r0, zero       /* clear faultnum */
+	 addi r0, r0, PTREGS_OFFSET_ORIG_R0 - PTREGS_OFFSET_FAULTNUM
+	}
+	{
+	 st r0, zero       /* clear orig_r0 */
+	 addli r0, r0, -PTREGS_OFFSET_ORIG_R0    /* restore r0 to base */
+	}
+	jrp lr
+	STD_ENDPROC(get_pt_regs)
--- a/arch/tile/kernel/tile-desc_64.c
+++ b/arch/tile/kernel/tile-desc_64.c
--- a/arch/tile/lib/memchr_64.c
+++ b/arch/tile/lib/memchr_64.c
@ -0,0 +1,71 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+void *memchr(const void *s, int c, size_t n)
+{
+	const uint64_t *last_word_ptr;
+	const uint64_t *p;
+	const char *last_byte_ptr;
+	uintptr_t s_int;
+	uint64_t goal, before_mask, v, bits;
+	char *ret;
+
+	if (__builtin_expect(n == 0, 0)) {
+		/* Don't dereference any memory if the array is empty. */
+		return NULL;
+	}
+
+	/* Get an aligned pointer. */
+	s_int = (uintptr_t) s;
+	p = (const uint64_t *)(s_int & -8);
+
+	/* Create eight copies of the byte for which we are looking. */
+	goal = 0x0101010101010101ULL * (uint8_t) c;
+
+	/* Read the first word, but munge it so that bytes before the array
+	 * will not match goal.
+	 *
+	 * Note that this shift count expression works because we know
+	 * shift counts are taken mod 64.
+	 */
+	before_mask = (1ULL << (s_int << 3)) - 1;
+	v = (*p | before_mask) ^ (goal & before_mask);
+
+	/* Compute the address of the last byte. */
+	last_byte_ptr = (const char *)s + n - 1;
+
+	/* Compute the address of the word containing the last byte. */
+	last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8);
+
+	while ((bits = __insn_v1cmpeq(v, goal)) == 0) {
+		if (__builtin_expect(p == last_word_ptr, 0)) {
+			/* We already read the last word in the array,
+			 * so give up.
+			 */
+			return NULL;
+		}
+		v = *++p;
+	}
+
+	/* We found a match, but it might be in a byte past the end
+	 * of the array.
+	 */
+	ret = ((char *)p) + (__insn_ctz(bits) >> 3);
+	return (ret <= last_byte_ptr) ? ret : NULL;
+}
+EXPORT_SYMBOL(memchr);
--- a/arch/tile/lib/memcpy_64.c
+++ b/arch/tile/lib/memcpy_64.c
@ -0,0 +1,220 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#define __memcpy memcpy
+/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
+
+/* Must be 8 bytes in size. */
+#define word_t uint64_t
+
+#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
+#error "Assumes 64 or 128 byte line size"
+#endif
+
+/* How many cache lines ahead should we prefetch? */
+#define PREFETCH_LINES_AHEAD 3
+
+/*
+ * Provide "base versions" of load and store for the normal code path.
+ * The kernel provides other versions for userspace copies.
+ */
+#define ST(p, v) (*(p) = (v))
+#define LD(p) (*(p))
+
+#ifndef USERCOPY_FUNC
+#define ST1 ST
+#define ST2 ST
+#define ST4 ST
+#define ST8 ST
+#define LD1 LD
+#define LD2 LD
+#define LD4 LD
+#define LD8 LD
+#define RETVAL dstv
+void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
+#else
+/*
+ * Special kernel version will provide implementation of the LDn/STn
+ * macros to return a count of uncopied bytes due to mm fault.
+ */
+#define RETVAL 0
+int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
+#endif
+{
+	char *__restrict dst1 = (char *)dstv;
+	const char *__restrict src1 = (const char *)srcv;
+	const char *__restrict src1_end;
+	const char *__restrict prefetch;
+	word_t *__restrict dst8;    /* 8-byte pointer to destination memory. */
+	word_t final; /* Final bytes to write to trailing word, if any */
+	long i;
+
+	if (n < 16) {
+		for (; n; n--)
+			ST1(dst1++, LD1(src1++));
+		return RETVAL;
+	}
+
+	/*
+	 * Locate the end of source memory we will copy.  Don't
+	 * prefetch past this.
+	 */
+	src1_end = src1 + n - 1;
+
+	/* Prefetch ahead a few cache lines, but not past the end. */
+	prefetch = src1;
+	for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
+		__insn_prefetch(prefetch);
+		prefetch += CHIP_L2_LINE_SIZE();
+		prefetch = (prefetch > src1_end) ? prefetch : src1;
+	}
+
+	/* Copy bytes until dst is word-aligned. */
+	for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
+		ST1(dst1++, LD1(src1++));
+
+	/* 8-byte pointer to destination memory. */
+	dst8 = (word_t *)dst1;
+
+	if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
+		/*
+		 * Misaligned copy.  Copy 8 bytes at a time, but don't
+		 * bother with other fanciness.
+		 *
+		 * TODO: Consider prefetching and using wh64 as well.
+		 */
+
+		/* Create an aligned src8. */
+		const word_t *__restrict src8 =
+			(const word_t *)((uintptr_t)src1 & -sizeof(word_t));
+		word_t b;
+
+		word_t a = LD8(src8++);
+		for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
+			b = LD8(src8++);
+			a = __insn_dblalign(a, b, src1);
+			ST8(dst8++, a);
+			a = b;
+		}
+
+		if (n == 0)
+			return RETVAL;
+
+		b = ((const char *)src8 <= src1_end) ? *src8 : 0;
+
+		/*
+		 * Final source bytes to write to trailing partial
+		 * word, if any.
+		 */
+		final = __insn_dblalign(a, b, src1);
+	} else {
+		/* Aligned copy. */
+
+		const word_t* __restrict src8 = (const word_t *)src1;
+
+		/* src8 and dst8 are both word-aligned. */
+		if (n >= CHIP_L2_LINE_SIZE()) {
+			/* Copy until 'dst' is cache-line-aligned. */
+			for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
+			     n -= sizeof(word_t))
+				ST8(dst8++, LD8(src8++));
+
+			for (; n >= CHIP_L2_LINE_SIZE(); ) {
+				__insn_wh64(dst8);
+
+				/*
+				 * Prefetch and advance to next line
+				 * to prefetch, but don't go past the end
+				 */
+				__insn_prefetch(prefetch);
+				prefetch += CHIP_L2_LINE_SIZE();
+				prefetch = (prefetch > src1_end) ? prefetch :
+					(const char *)src8;
+
+				/*
+				 * Copy an entire cache line.  Manually
+				 * unrolled to avoid idiosyncracies of
+				 * compiler unrolling.
+				 */
+#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
+				COPY_WORD(0);
+				COPY_WORD(1);
+				COPY_WORD(2);
+				COPY_WORD(3);
+				COPY_WORD(4);
+				COPY_WORD(5);
+				COPY_WORD(6);
+				COPY_WORD(7);
+#if CHIP_L2_LINE_SIZE() == 128
+				COPY_WORD(8);
+				COPY_WORD(9);
+				COPY_WORD(10);
+				COPY_WORD(11);
+				COPY_WORD(12);
+				COPY_WORD(13);
+				COPY_WORD(14);
+				COPY_WORD(15);
+#elif CHIP_L2_LINE_SIZE() != 64
+# error Fix code that assumes particular L2 cache line sizes
+#endif
+
+				dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+				src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+			}
+		}
+
+		for (; n >= sizeof(word_t); n -= sizeof(word_t))
+			ST8(dst8++, LD8(src8++));
+
+		if (__builtin_expect(n == 0, 1))
+			return RETVAL;
+
+		final = LD8(src8);
+	}
+
+	/* n != 0 if we get here.  Write out any trailing bytes. */
+	dst1 = (char *)dst8;
+	if (n & 4) {
+		ST4((uint32_t *)dst1, final);
+		dst1 += 4;
+		final >>= 32;
+		n &= 3;
+	}
+	if (n & 2) {
+		ST2((uint16_t *)dst1, final);
+		dst1 += 2;
+		final >>= 16;
+		n &= 1;
+	}
+	if (n)
+		ST1((uint8_t *)dst1, final);
+
+	return RETVAL;
+}
+
+
+#ifdef USERCOPY_FUNC
+#undef ST1
+#undef ST2
+#undef ST4
+#undef ST8
+#undef LD1
+#undef LD2
+#undef LD4
+#undef LD8
+#undef USERCOPY_FUNC
+#endif
--- a/arch/tile/lib/memcpy_user_64.c
+++ b/arch/tile/lib/memcpy_user_64.c
@ -0,0 +1,86 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Do memcpy(), but trap and return "n" when a load or store faults.
+ *
+ * Note: this idiom only works when memcpy() compiles to a leaf function.
+ * If "sp" is updated during memcpy, the "jrp lr" will be incorrect.
+ *
+ * Also note that we are capturing "n" from the containing scope here.
+ */
+
+#define _ST(p, inst, v)						\
+	({							\
+		asm("1: " #inst " %0, %1;"			\
+		    ".pushsection .coldtext.memcpy,\"ax\";"	\
+		    "2: { move r0, %2; jrp lr };"		\
+		    ".section __ex_table,\"a\";"		\
+		    ".quad 1b, 2b;"				\
+		    ".popsection"				\
+		    : "=m" (*(p)) : "r" (v), "r" (n));		\
+	})
+
+#define _LD(p, inst)						\
+	({							\
+		unsigned long __v;				\
+		asm("1: " #inst " %0, %1;"			\
+		    ".pushsection .coldtext.memcpy,\"ax\";"	\
+		    "2: { move r0, %2; jrp lr };"		\
+		    ".section __ex_table,\"a\";"		\
+		    ".quad 1b, 2b;"				\
+		    ".popsection"				\
+		    : "=r" (__v) : "m" (*(p)), "r" (n));	\
+		__v;						\
+	})
+
+#define USERCOPY_FUNC __copy_to_user_inatomic
+#define ST1(p, v) _ST((p), st1, (v))
+#define ST2(p, v) _ST((p), st2, (v))
+#define ST4(p, v) _ST((p), st4, (v))
+#define ST8(p, v) _ST((p), st, (v))
+#define LD1 LD
+#define LD2 LD
+#define LD4 LD
+#define LD8 LD
+#include "memcpy_64.c"
+
+#define USERCOPY_FUNC __copy_from_user_inatomic
+#define ST1 ST
+#define ST2 ST
+#define ST4 ST
+#define ST8 ST
+#define LD1(p) _LD((p), ld1u)
+#define LD2(p) _LD((p), ld2u)
+#define LD4(p) _LD((p), ld4u)
+#define LD8(p) _LD((p), ld)
+#include "memcpy_64.c"
+
+#define USERCOPY_FUNC __copy_in_user_inatomic
+#define ST1(p, v) _ST((p), st1, (v))
+#define ST2(p, v) _ST((p), st2, (v))
+#define ST4(p, v) _ST((p), st4, (v))
+#define ST8(p, v) _ST((p), st, (v))
+#define LD1(p) _LD((p), ld1u)
+#define LD2(p) _LD((p), ld2u)
+#define LD4(p) _LD((p), ld4u)
+#define LD8(p) _LD((p), ld)
+#include "memcpy_64.c"
+
+unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
+				       unsigned long n)
+{
+	unsigned long rc = __copy_from_user_inatomic(to, from, n);
+	if (unlikely(rc))
+		memset(to + n - rc, 0, rc);
+	return rc;
+}
--- a/arch/tile/lib/memset_64.c
+++ b/arch/tile/lib/memset_64.c
@ -0,0 +1,145 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <arch/chip.h>
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef memset
+
+void *memset(void *s, int c, size_t n)
+{
+	uint64_t *out64;
+	int n64, to_align64;
+	uint64_t v64;
+	uint8_t *out8 = s;
+
+	/* Experimentation shows that a trivial tight loop is a win up until
+	 * around a size of 20, where writing a word at a time starts to win.
+	 */
+#define BYTE_CUTOFF 20
+
+#if BYTE_CUTOFF < 7
+	/* This must be at least at least this big, or some code later
+	 * on doesn't work.
+	 */
+#error "BYTE_CUTOFF is too small"
+#endif
+
+	if (n < BYTE_CUTOFF) {
+		/* Strangely, this turns out to be the tightest way to
+		 * write this loop.
+		 */
+		if (n != 0) {
+			do {
+				/* Strangely, combining these into one line
+				 * performs worse.
+				 */
+				*out8 = c;
+				out8++;
+			} while (--n != 0);
+		}
+
+		return s;
+	}
+
+	/* Align 'out8'. We know n >= 7 so this won't write past the end. */
+	while (((uintptr_t) out8 & 7) != 0) {
+		*out8++ = c;
+		--n;
+	}
+
+	/* Align 'n'. */
+	while (n & 7)
+		out8[--n] = c;
+
+	out64 = (uint64_t *) out8;
+	n64 = n >> 3;
+
+	/* Tile input byte out to 64 bits. */
+	/* KLUDGE */
+	v64 = 0x0101010101010101ULL * (uint8_t)c;
+
+	/* This must be at least 8 or the following loop doesn't work. */
+#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
+
+	/* Determine how many words we need to emit before the 'out32'
+	 * pointer becomes aligned modulo the cache line size.
+	 */
+	to_align64 = (-((uintptr_t)out64 >> 3)) &
+		(CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1);
+
+	/* Only bother aligning and using wh64 if there is at least
+	 * one full cache line to process.  This check also prevents
+	 * overrunning the end of the buffer with alignment words.
+	 */
+	if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) {
+		int lines_left;
+
+		/* Align out64 mod the cache line size so we can use wh64. */
+		n64 -= to_align64;
+		for (; to_align64 != 0; to_align64--) {
+			*out64 = v64;
+			out64++;
+		}
+
+		/* Use unsigned divide to turn this into a right shift. */
+		lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS;
+
+		do {
+			/* Only wh64 a few lines at a time, so we don't
+			 * exceed the maximum number of victim lines.
+			 */
+			int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
+				  ? lines_left
+				  : CHIP_MAX_OUTSTANDING_VICTIMS());
+			uint64_t *wh = out64;
+			int i = x;
+			int j;
+
+			lines_left -= x;
+
+			do {
+				__insn_wh64(wh);
+				wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS;
+			} while (--i);
+
+			for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4);
+			     j != 0; j--) {
+				*out64++ = v64;
+				*out64++ = v64;
+				*out64++ = v64;
+				*out64++ = v64;
+			}
+		} while (lines_left != 0);
+
+		/* We processed all full lines above, so only this many
+		 * words remain to be processed.
+		 */
+		n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1;
+	}
+
+	/* Now handle any leftover values. */
+	if (n64 != 0) {
+		do {
+			*out64 = v64;
+			out64++;
+		} while (--n64 != 0);
+	}
+
+	return s;
+}
+EXPORT_SYMBOL(memset);
--- a/arch/tile/lib/spinlock_64.c
+++ b/arch/tile/lib/spinlock_64.c
@ -0,0 +1,104 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+
+#include "spinlock_common.h"
+
+/*
+ * Read the spinlock value without allocating in our cache and without
+ * causing an invalidation to another cpu with a copy of the cacheline.
+ * This is important when we are spinning waiting for the lock.
+ */
+static inline u32 arch_spin_read_noalloc(void *lock)
+{
+	return atomic_cmpxchg((atomic_t *)lock, -1, -1);
+}
+
+/*
+ * Wait until the high bits (current) match my ticket.
+ * If we notice the overflow bit set on entry, we clear it.
+ */
+void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket)
+{
+	if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) {
+		__insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW);
+		my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW;
+	}
+
+	for (;;) {
+		u32 val = arch_spin_read_noalloc(lock);
+		u32 delta = my_ticket - arch_spin_current(val);
+		if (delta == 0)
+			return;
+		relax((128 / CYCLES_PER_RELAX_LOOP) * delta);
+	}
+}
+EXPORT_SYMBOL(arch_spin_lock_slow);
+
+/*
+ * Check the lock to see if it is plausible, and try to get it with cmpxchg().
+ */
+int arch_spin_trylock(arch_spinlock_t *lock)
+{
+	u32 val = arch_spin_read_noalloc(lock);
+	if (unlikely(arch_spin_current(val) != arch_spin_next(val)))
+		return 0;
+	return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW)
+		== val;
+}
+EXPORT_SYMBOL(arch_spin_trylock);
+
+void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	u32 iterations = 0;
+	while (arch_spin_is_locked(lock))
+		delay_backoff(iterations++);
+}
+EXPORT_SYMBOL(arch_spin_unlock_wait);
+
+/*
+ * If the read lock fails due to a writer, we retry periodically
+ * until the value is positive and we write our incremented reader count.
+ */
+void __read_lock_failed(arch_rwlock_t *rw)
+{
+	u32 val;
+	int iterations = 0;
+	do {
+		delay_backoff(iterations++);
+		val = __insn_fetchaddgez4(&rw->lock, 1);
+	} while (unlikely(arch_write_val_locked(val)));
+}
+EXPORT_SYMBOL(__read_lock_failed);
+
+/*
+ * If we failed because there were readers, clear the "writer" bit
+ * so we don't block additional readers.  Otherwise, there was another
+ * writer anyway, so our "fetchor" made no difference.  Then wait,
+ * issuing periodic fetchor instructions, till we get the lock.
+ */
+void __write_lock_failed(arch_rwlock_t *rw, u32 val)
+{
+	int iterations = 0;
+	do {
+		if (!arch_write_val_locked(val))
+			val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT);
+		delay_backoff(iterations++);
+		val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT);
+	} while (val != 0);
+}
+EXPORT_SYMBOL(__write_lock_failed);
--- a/arch/tile/lib/strchr_64.c
+++ b/arch/tile/lib/strchr_64.c
@ -0,0 +1,67 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef strchr
+
+char *strchr(const char *s, int c)
+{
+	int z, g;
+
+	/* Get an aligned pointer. */
+	const uintptr_t s_int = (uintptr_t) s;
+	const uint64_t *p = (const uint64_t *)(s_int & -8);
+
+	/* Create eight copies of the byte for which we are looking. */
+	const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
+
+	/* Read the first aligned word, but force bytes before the string to
+	 * match neither zero nor goal (we make sure the high bit of each
+	 * byte is 1, and the low 7 bits are all the opposite of the goal
+	 * byte).
+	 *
+	 * Note that this shift count expression works because we know shift
+	 * counts are taken mod 64.
+	 */
+	const uint64_t before_mask = (1ULL << (s_int << 3)) - 1;
+	uint64_t v = (*p | before_mask) ^
+		(goal & __insn_v1shrsi(before_mask, 1));
+
+	uint64_t zero_matches, goal_matches;
+	while (1) {
+		/* Look for a terminating '\0'. */
+		zero_matches = __insn_v1cmpeqi(v, 0);
+
+		/* Look for the goal byte. */
+		goal_matches = __insn_v1cmpeq(v, goal);
+
+		if (__builtin_expect((zero_matches | goal_matches) != 0, 0))
+			break;
+
+		v = *++p;
+	}
+
+	z = __insn_ctz(zero_matches);
+	g = __insn_ctz(goal_matches);
+
+	/* If we found c before '\0' we got a match. Note that if c == '\0'
+	 * then g == z, and we correctly return the address of the '\0'
+	 * rather than NULL.
+	 */
+	return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
+}
+EXPORT_SYMBOL(strchr);
--- a/arch/tile/lib/strlen_64.c
+++ b/arch/tile/lib/strlen_64.c
@ -0,0 +1,38 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef strlen
+
+size_t strlen(const char *s)
+{
+	/* Get an aligned pointer. */
+	const uintptr_t s_int = (uintptr_t) s;
+	const uint64_t *p = (const uint64_t *)(s_int & -8);
+
+	/* Read the first word, but force bytes before the string to be nonzero.
+	 * This expression works because we know shift counts are taken mod 64.
+	 */
+	uint64_t v = *p | ((1ULL << (s_int << 3)) - 1);
+
+	uint64_t bits;
+	while ((bits = __insn_v1cmpeqi(v, 0)) == 0)
+		v = *++p;
+
+	return ((const char *)p) + (__insn_ctz(bits) >> 3) - s;
+}
+EXPORT_SYMBOL(strlen);
--- a/arch/tile/lib/usercopy_64.S
+++ b/arch/tile/lib/usercopy_64.S
@ -0,0 +1,196 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/cache.h>
+#include <arch/chip.h>
+
+/* Access user memory, but use MMU to avoid propagating kernel exceptions. */
+
+	.pushsection .fixup,"ax"
+
+get_user_fault:
+	{ movei r1, -EFAULT; move r0, zero }
+	jrp lr
+	ENDPROC(get_user_fault)
+
+put_user_fault:
+	{ movei r0, -EFAULT; jrp lr }
+	ENDPROC(put_user_fault)
+
+	.popsection
+
+/*
+ * __get_user_N functions take a pointer in r0, and return 0 in r1
+ * on success, with the value in r0; or else -EFAULT in r1.
+ */
+#define __get_user_N(bytes, LOAD) \
+	STD_ENTRY(__get_user_##bytes); \
+1:	{ LOAD r0, r0; move r1, zero }; \
+	jrp lr; \
+	STD_ENDPROC(__get_user_##bytes); \
+	.pushsection __ex_table,"a"; \
+	.quad 1b, get_user_fault; \
+	.popsection
+
+__get_user_N(1, ld1u)
+__get_user_N(2, ld2u)
+__get_user_N(4, ld4u)
+__get_user_N(8, ld)
+
+/*
+ * __put_user_N functions take a value in r0 and a pointer in r1,
+ * and return 0 in r0 on success or -EFAULT on failure.
+ */
+#define __put_user_N(bytes, STORE) \
+	STD_ENTRY(__put_user_##bytes); \
+1:	{ STORE r1, r0; move r0, zero }; \
+	jrp lr; \
+	STD_ENDPROC(__put_user_##bytes); \
+	.pushsection __ex_table,"a"; \
+	.quad 1b, put_user_fault; \
+	.popsection
+
+__put_user_N(1, st1)
+__put_user_N(2, st2)
+__put_user_N(4, st4)
+__put_user_N(8, st)
+
+/*
+ * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
+ * It returns the length, including the terminating NUL, or zero on exception.
+ * If length is greater than the bound, returns one plus the bound.
+ */
+STD_ENTRY(strnlen_user_asm)
+	{ beqz r1, 2f; addi r3, r0, -1 }  /* bias down to include NUL */
+1:      { ld1u r4, r0; addi r1, r1, -1 }
+	beqz r4, 2f
+	{ bnezt r1, 1b; addi r0, r0, 1 }
+2:      { sub r0, r0, r3; jrp lr }
+	STD_ENDPROC(strnlen_user_asm)
+	.pushsection .fixup,"ax"
+strnlen_user_fault:
+	{ move r0, zero; jrp lr }
+	ENDPROC(strnlen_user_fault)
+	.section __ex_table,"a"
+	.quad 1b, strnlen_user_fault
+	.popsection
+
+/*
+ * strncpy_from_user_asm takes the kernel target pointer in r0,
+ * the userspace source pointer in r1, and the length bound (including
+ * the trailing NUL) in r2.  On success, it returns the string length
+ * (not including the trailing NUL), or -EFAULT on failure.
+ */
+STD_ENTRY(strncpy_from_user_asm)
+	{ beqz r2, 2f; move r3, r0 }
+1:      { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+	{ st1 r0, r4; addi r0, r0, 1 }
+	beqz r2, 2f
+	bnezt r4, 1b
+	addi r0, r0, -1   /* don't count the trailing NUL */
+2:      { sub r0, r0, r3; jrp lr }
+	STD_ENDPROC(strncpy_from_user_asm)
+	.pushsection .fixup,"ax"
+strncpy_from_user_fault:
+	{ movei r0, -EFAULT; jrp lr }
+	ENDPROC(strncpy_from_user_fault)
+	.section __ex_table,"a"
+	.quad 1b, strncpy_from_user_fault
+	.popsection
+
+/*
+ * clear_user_asm takes the user target address in r0 and the
+ * number of bytes to zero in r1.
+ * It returns the number of uncopiable bytes (hopefully zero) in r0.
+ * Note that we don't use a separate .fixup section here since we fall
+ * through into the "fixup" code as the last straight-line bundle anyway.
+ */
+STD_ENTRY(clear_user_asm)
+	{ beqz r1, 2f; or r2, r0, r1 }
+	andi r2, r2, 7
+	beqzt r2, .Lclear_aligned_user_asm
+1:      { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 }
+	bnezt r1, 1b
+2:      { move r0, r1; jrp lr }
+	.pushsection __ex_table,"a"
+	.quad 1b, 2b
+	.popsection
+
+.Lclear_aligned_user_asm:
+1:      { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 }
+	bnezt r1, 1b
+2:      { move r0, r1; jrp lr }
+	STD_ENDPROC(clear_user_asm)
+	.pushsection __ex_table,"a"
+	.quad 1b, 2b
+	.popsection
+
+/*
+ * flush_user_asm takes the user target address in r0 and the
+ * number of bytes to flush in r1.
+ * It returns the number of unflushable bytes (hopefully zero) in r0.
+ */
+STD_ENTRY(flush_user_asm)
+	beqz r1, 2f
+	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+	{ and r0, r0, r2; and r1, r1, r2 }
+	{ sub r1, r1, r0 }
+1:      { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() }
+	{ addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b }
+2:      { move r0, r1; jrp lr }
+	STD_ENDPROC(flush_user_asm)
+	.pushsection __ex_table,"a"
+	.quad 1b, 2b
+	.popsection
+
+/*
+ * inv_user_asm takes the user target address in r0 and the
+ * number of bytes to invalidate in r1.
+ * It returns the number of not inv'able bytes (hopefully zero) in r0.
+ */
+STD_ENTRY(inv_user_asm)
+	beqz r1, 2f
+	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+	{ and r0, r0, r2; and r1, r1, r2 }
+	{ sub r1, r1, r0 }
+1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
+	{ addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b }
+2:      { move r0, r1; jrp lr }
+	STD_ENDPROC(inv_user_asm)
+	.pushsection __ex_table,"a"
+	.quad 1b, 2b
+	.popsection
+
+/*
+ * finv_user_asm takes the user target address in r0 and the
+ * number of bytes to flush-invalidate in r1.
+ * It returns the number of not finv'able bytes (hopefully zero) in r0.
+ */
+STD_ENTRY(finv_user_asm)
+	beqz r1, 2f
+	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+	{ and r0, r0, r2; and r1, r1, r2 }
+	{ sub r1, r1, r0 }
+1:      { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() }
+	{ addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b }
+2:      { move r0, r1; jrp lr }
+	STD_ENDPROC(finv_user_asm)
+	.pushsection __ex_table,"a"
+	.quad 1b, 2b
+	.popsection
--- a/arch/tile/mm/migrate_64.S
+++ b/arch/tile/mm/migrate_64.S
@ -0,0 +1,187 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * This routine is a helper for migrating the home of a set of pages to
+ * a new cpu.  See the documentation in homecache.c for more information.
+ */
+
+#include <linux/linkage.h>
+#include <linux/threads.h>
+#include <asm/page.h>
+#include <asm/thread_info.h>
+#include <asm/types.h>
+#include <asm/asm-offsets.h>
+#include <hv/hypervisor.h>
+
+	.text
+
+/*
+ * First, some definitions that apply to all the code in the file.
+ */
+
+/* Locals (caller-save) */
+#define r_tmp		r10
+#define r_save_sp	r11
+
+/* What we save where in the stack frame; must include all callee-saves. */
+#define FRAME_SP	8
+#define FRAME_R30	16
+#define FRAME_R31	24
+#define FRAME_R32	32
+#define FRAME_R33	40
+#define FRAME_SIZE	48
+
+
+
+
+/*
+ * On entry:
+ *
+ *   r0 the new context PA to install (moved to r_context)
+ *   r1 PTE to use for context access (moved to r_access)
+ *   r2 ASID to use for new context (moved to r_asid)
+ *   r3 pointer to cpumask with just this cpu set in it (r_my_cpumask)
+ */
+
+/* Arguments (caller-save) */
+#define r_context_in	r0
+#define r_access_in	r1
+#define r_asid_in	r2
+#define r_my_cpumask	r3
+
+/* Locals (callee-save); must not be more than FRAME_xxx above. */
+#define r_save_ics	r30
+#define r_context	r31
+#define r_access	r32
+#define r_asid		r33
+
+/*
+ * Caller-save locals and frame constants are the same as
+ * for homecache_migrate_stack_and_flush.
+ */
+
+STD_ENTRY(flush_and_install_context)
+	/*
+	 * Create a stack frame; we can't touch it once we flush the
+	 * cache until we install the new page table and flush the TLB.
+	 */
+	{
+	 move r_save_sp, sp
+	 st sp, lr
+	 addi sp, sp, -FRAME_SIZE
+	}
+	addi r_tmp, sp, FRAME_SP
+	{
+	 st r_tmp, r_save_sp
+	 addi r_tmp, sp, FRAME_R30
+	}
+	{
+	 st r_tmp, r30
+	 addi r_tmp, sp, FRAME_R31
+	}
+	{
+	 st r_tmp, r31
+	 addi r_tmp, sp, FRAME_R32
+	}
+	{
+	 st r_tmp, r32
+	 addi r_tmp, sp, FRAME_R33
+	}
+	st r_tmp, r33
+
+	/* Move some arguments to callee-save registers. */
+	{
+	 move r_context, r_context_in
+	 move r_access, r_access_in
+	}
+	move r_asid, r_asid_in
+
+	/* Disable interrupts, since we can't use our stack. */
+	{
+	 mfspr r_save_ics, INTERRUPT_CRITICAL_SECTION
+	 movei r_tmp, 1
+	}
+	mtspr INTERRUPT_CRITICAL_SECTION, r_tmp
+
+	/* First, flush our L2 cache. */
+	{
+	 move r0, zero  /* cache_pa */
+	 moveli r1, hw2_last(HV_FLUSH_EVICT_L2)  /* cache_control */
+	}
+	{
+	 shl16insli r1, r1, hw1(HV_FLUSH_EVICT_L2)
+	 move r2, r_my_cpumask  /* cache_cpumask */
+	}
+	{
+	 shl16insli r1, r1, hw0(HV_FLUSH_EVICT_L2)
+	 move r3, zero  /* tlb_va */
+	}
+	{
+	 move r4, zero  /* tlb_length */
+	 move r5, zero  /* tlb_pgsize */
+	}
+	{
+	 move r6, zero  /* tlb_cpumask */
+	 move r7, zero  /* asids */
+	}
+	{
+	 move r8, zero  /* asidcount */
+	 jal hv_flush_remote
+	}
+	bnez r0, 1f
+
+	/* Now install the new page table. */
+	{
+	 move r0, r_context
+	 move r1, r_access
+	}
+	{
+	 move r2, r_asid
+	 movei r3, HV_CTX_DIRECTIO
+	}
+	jal hv_install_context
+	bnez r0, 1f
+
+	/* Finally, flush the TLB. */
+	{
+	 movei r0, 0   /* preserve_global */
+	 jal hv_flush_all
+	}
+
+1:      /* Reset interrupts back how they were before. */
+	mtspr INTERRUPT_CRITICAL_SECTION, r_save_ics
+
+	/* Restore the callee-saved registers and return. */
+	addli lr, sp, FRAME_SIZE
+	{
+	 ld lr, lr
+	 addli r_tmp, sp, FRAME_R30
+	}
+	{
+	 ld r30, r_tmp
+	 addli r_tmp, sp, FRAME_R31
+	}
+	{
+	 ld r31, r_tmp
+	 addli r_tmp, sp, FRAME_R32
+	}
+	{
+	 ld r32, r_tmp
+	 addli r_tmp, sp, FRAME_R33
+	}
+	{
+	 ld r33, r_tmp
+	 addi sp, sp, FRAME_SIZE
+	}
+	jrp lr
+	STD_ENDPROC(flush_and_install_context)
--- a/drivers/input/input-compat.h
+++ b/drivers/input/input-compat.h
@ -19,7 +19,7 @@

 /* Note to the author of this code: did it ever occur to
   you why the ifdefs are needed? Think about it again. -AK */
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) || defined(CONFIG_TILE)
 #  define INPUT_COMPAT_TEST is_compat_task()
 #elif defined(CONFIG_S390)
 #  define INPUT_COMPAT_TEST test_thread_flag(TIF_31BIT)