arch/tile/lib/atomic_asm_32.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 *
 * Support routines for atomic operations.  Each function takes:
 *
 * r0: address to manipulate
 * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
 * r2: new value to write, or for cmpxchg/add_unless, value to compare against
 * r3: (cmpxchg/xchg_add_unless) new value to write or add;
 *     (atomic64 ops) high word of value to write
 * r4/r5: (cmpxchg64/add_unless64) new value to write or add
 *
 * The 32-bit routines return a "struct __get_user" so that the futex code
 * has an opportunity to return -EFAULT to the user if needed.
 * The 64-bit routines just return a "long long" with the value,
 * since they are only used from kernel space and don't expect to fault.
 * Support for 16-bit ops is included in the framework but we don't provide
 * any (x86_64 has an atomic_inc_short(), so we might want to some day).
 *
 * Note that the caller is advised to issue a suitable L1 or L2
 * prefetch on the address being manipulated to avoid extra stalls.
 * In addition, the hot path is on two icache lines, and we start with
 * a jump to the second line to make sure they are both in cache so
 * that we never stall waiting on icache fill while holding the lock.
 * (This doesn't work out with most 64-bit ops, since they consume
 * too many bundles, so may take an extra i-cache stall.)
 *
 * These routines set the INTERRUPT_CRITICAL_SECTION bit, just
 * like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt
 * the code, just page faults.
 *
 * If the load or store faults in a way that can be directly fixed in
 * the do_page_fault_ics() handler (e.g. a vmalloc reference) we fix it
 * directly, return to the instruction that faulted, and retry it.
 *
 * If the load or store faults in a way that potentially requires us
 * to release the atomic lock, then retry (e.g. a migrating PTE), we
 * reset the PC in do_page_fault_ics() to the "tns" instruction so
 * that on return we will reacquire the lock and restart the op.  We
 * are somewhat overloading the exception_table_entry notion by doing
 * this, since those entries are not normally used for migrating PTEs.
 *
 * If the main page fault handler discovers a bad address, it will see
 * the PC pointing to the "tns" instruction (due to the earlier
 * exception_table_entry processing in do_page_fault_ics), and
 * re-reset the PC to the fault handler, atomic_bad_address(), which
 * effectively takes over from the atomic op and can either return a
 * bad "struct __get_user" (for user addresses) or can just panic (for
 * bad kernel addresses).
 *
 * Note that if the value we would store is the same as what we
 * loaded, we bypass the store.  Other platforms with true atomics can
 * make the guarantee that a non-atomic __clear_bit(), for example,
 * can safely race with an atomic test_and_set_bit(); this example is
 * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do
 * that on Tile since the "atomic" op is really just a
 * read/modify/write, and can race with the non-atomic
 * read/modify/write.  However, if we can short-circuit the write when
 * it is not needed, in the atomic case, we avoid the race.
 */

#include <linux/linkage.h>
#include <asm/atomic_32.h>
#include <asm/page.h>
#include <asm/processor.h>

	.section .text.atomic,"ax"
ENTRY(__start_atomic_asm_code)

	.macro  atomic_op, name, bitwidth, body
	.align  64
STD_ENTRY_SECTION(__atomic\name, .text.atomic)
	{
	 movei  r24, 1
	 j      4f		/* branch to second cache line */
	}
1:	{
	 .ifc \bitwidth,16
	 lh     r22, r0
	 .else
	 lw     r22, r0
	 addi   r28, r0, 4
	 .endif
	}
	.ifc \bitwidth,64
	lw      r23, r28
	.endif
	\body /* set r24, and r25 if 64-bit */
	{
	 seq    r26, r22, r24
	 seq    r27, r23, r25
	}
	.ifc \bitwidth,64
	bbnst   r27, 2f
	.endif
	bbs     r26, 3f		/* skip write-back if it's the same value */
2:	{
	 .ifc \bitwidth,16
	 sh     r0, r24
	 .else
	 sw     r0, r24
	 .endif
	}
	.ifc \bitwidth,64
	sw      r28, r25
	.endif
	mf
3:	{
	 move   r0, r22
	 .ifc \bitwidth,64
	 move   r1, r23
	 .else
	 move   r1, zero
	 .endif
	 sw     ATOMIC_LOCK_REG_NAME, zero
	}
	mtspr   INTERRUPT_CRITICAL_SECTION, zero
	jrp     lr
4:	{
	 move   ATOMIC_LOCK_REG_NAME, r1
	 mtspr  INTERRUPT_CRITICAL_SECTION, r24
	}
#ifndef CONFIG_SMP
	j       1b		/* no atomic locks */
#else
	{
	 tns    r21, ATOMIC_LOCK_REG_NAME
	 moveli r23, 2048       /* maximum backoff time in cycles */
	}
	{
	 bzt    r21, 1b		/* branch if lock acquired */
	 moveli r25, 32         /* starting backoff time in cycles */
	}
5:	mtspr   INTERRUPT_CRITICAL_SECTION, zero
	mfspr   r26, CYCLE_LOW  /* get start point for this backoff */
6:	mfspr   r22, CYCLE_LOW  /* test to see if we've backed off enough */
	sub     r22, r22, r26
	slt     r22, r22, r25
	bbst    r22, 6b
	{
	 mtspr  INTERRUPT_CRITICAL_SECTION, r24
	 shli   r25, r25, 1     /* double the backoff; retry the tns */
	}
	{
	 tns    r21, ATOMIC_LOCK_REG_NAME
	 slt    r26, r23, r25   /* is the proposed backoff too big? */
	}
	{
	 bzt    r21, 1b		/* branch if lock acquired */
	 mvnz   r25, r26, r23
	}
	j       5b
#endif
	STD_ENDPROC(__atomic\name)
	.ifc \bitwidth,32
	.pushsection __ex_table,"a"
	.align  4
	.word   1b, __atomic\name
	.word   2b, __atomic\name
	.word   __atomic\name, __atomic_bad_address
	.popsection
	.endif
	.endm


/*
 * Use __atomic32 prefix to avoid collisions with GCC builtin __atomic functions.
 */

atomic_op 32_cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }"
atomic_op 32_xchg, 32, "move r24, r2"
atomic_op 32_xchg_add, 32, "add r24, r22, r2"
atomic_op 32_xchg_add_unless, 32, \
	"sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }"
atomic_op 32_fetch_or, 32, "or r24, r22, r2"
atomic_op 32_fetch_and, 32, "and r24, r22, r2"
atomic_op 32_fetch_andn, 32, "nor r2, r2, zero; and r24, r22, r2"
atomic_op 32_fetch_xor, 32, "xor r24, r22, r2"

atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \
	{ bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }"
atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }"
atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \
	slt_u r26, r24, r22; add r25, r25, r26"
atomic_op 64_xchg_add_unless, 64, \
	"{ sne r26, r22, r2; sne r27, r23, r3 }; \
	{ bbns r26, 3f; add r24, r22, r4 }; \
	{ bbns r27, 3f; add r25, r23, r5 }; \
	slt_u r26, r24, r22; add r25, r25, r26"
atomic_op 64_fetch_or, 64, "{ or r24, r22, r2; or r25, r23, r3 }"
atomic_op 64_fetch_and, 64, "{ and r24, r22, r2; and r25, r23, r3 }"
atomic_op 64_fetch_xor, 64, "{ xor r24, r22, r2; xor r25, r23, r3 }"

	jrp     lr              /* happy backtracer */

ENTRY(__end_atomic_asm_code)