/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
*
* Iterators for Generic Page Table
*/
#ifndef __GENERIC_PT_PT_ITER_H
#define __GENERIC_PT_PT_ITER_H
#include "pt_common.h"
#include <linux/errno.h>
/*
* Use to mangle symbols so that backtraces and the symbol table are
* understandable. Any non-inlined function should get mangled like this.
*/
#define NS(fn) CONCATENATE(PTPFX, fn)
/**
 * pt_check_range() - Validate the range can be iterated
 * @range: Range to validate
 *
 * Check that range->va and range->last_va fall within the permitted range of
 * VAs. If the format is using PT_FEAT_SIGN_EXTEND then this also checks that
 * the sign extension is correct.
 *
 * Return: 0 on success, -ERANGE if the range is invalid.
 */
static inline int pt_check_range(struct pt_range *range)
{
pt_vaddr_t prefix;
PT_WARN_ON(!range->max_vasz_lg2);
if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) {
PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2);
prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ?
PT_VADDR_MAX :
0;
} else {
prefix = pt_full_va_prefix(range->common);
}
if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) ||
!fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2))
return -ERANGE;
return 0;
}
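/*
 * Example (illustrative sketch): validate a caller supplied VA range before
 * walking it:
 *
 *	struct pt_range range = pt_make_range(common, va, last_va);
 *
 *	if (pt_check_range(&range))
 *		return -ERANGE;
 */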
/**
* pt_index_to_va() - Update range->va to the current pts->index
* @pts: Iteration State
*
* Adjust range->va to match the current index. This is done in a lazy manner
* since computing the VA takes several instructions and is rarely required.
*/
static inline void pt_index_to_va(struct pt_state *pts)
{
pt_vaddr_t lower_va;
lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts));
pts->range->va = fvalog2_set_mod(pts->range->va, lower_va,
pt_table_oa_lg2sz(pts));
}
/*
 * Advance pts's index by 2^index_count_lg2 entries. If the index is currently
 * in the middle of a contiguous block it is advanced to the end of the block.
 * Callers use pt_index_to_va() to update the VA afterwards.
 */
static inline void _pt_advance(struct pt_state *pts,
unsigned int index_count_lg2)
{
pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0,
index_count_lg2);
}
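/*
 * Worked example (assuming log2_set_mod(x, 0, n) clears the low n bits of x):
 * with index = 5 inside a 16-entry contiguous block (index_count_lg2 = 4),
 * 5 + 16 = 21 = 0b10101, and clearing the low 4 bits leaves index = 16, the
 * first index after the block.
 */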
/**
* pt_entry_fully_covered() - Check if the item or entry is entirely contained
* within pts->range
* @pts: Iteration State
* @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or
* pt_entry_oa_lg2sz()
*
 * Return: true if the item is fully enclosed by pts->range.
*/
static inline bool pt_entry_fully_covered(const struct pt_state *pts,
unsigned int oasz_lg2)
{
struct pt_range *range = pts->range;
/* Range begins at the start of the entry */
	if (log2_mod(range->va, oasz_lg2))
return false;
/* Range ends past the end of the entry */
if (!log2_div_eq(range->va, range->last_va, oasz_lg2))
return true;
/* Range ends at the end of the entry */
return log2_mod_eq_max(range->last_va, oasz_lg2);
}
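/*
 * Worked example (illustrative): for a 2M entry (oasz_lg2 = 21) and a range
 * with va = 0x200000, last_va = 0x3fffff: the range begins at the start of
 * the entry, does not extend past it, and ends on the entry's last byte, so
 * this returns true. Shrinking last_va to 0x2fffff would return false.
 */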
/**
* pt_range_to_index() - Starting index for an iteration
* @pts: Iteration State
*
* Return: the starting index for the iteration in pts.
*/
static inline unsigned int pt_range_to_index(const struct pt_state *pts)
{
unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
PT_WARN_ON(pts->level > pts->range->top_level);
if (pts->range->top_level == pts->level)
return log2_div(fvalog2_mod(pts->range->va,
pts->range->max_vasz_lg2),
isz_lg2);
return log2_mod(log2_div(pts->range->va, isz_lg2),
pt_num_items_lg2(pts));
}
/**
 * pt_range_to_end_index() - Ending index for an iteration
 * @pts: Iteration State
 *
 * Return: one past the last index for the iteration in pts.
 */
static inline unsigned int pt_range_to_end_index(const struct pt_state *pts)
{
unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
struct pt_range *range = pts->range;
unsigned int num_entries_lg2;
if (range->va == range->last_va)
return pts->index + 1;
if (pts->range->top_level == pts->level)
return log2_div(fvalog2_mod(pts->range->last_va,
pts->range->max_vasz_lg2),
isz_lg2) +
1;
num_entries_lg2 = pt_num_items_lg2(pts);
/* last_va falls within this table */
if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2))
return log2_mod(log2_div(pts->range->last_va, isz_lg2),
num_entries_lg2) +
1;
return log2_to_int(num_entries_lg2);
}
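/*
 * Worked example (illustrative): for a non-top table with 512 items of 4K
 * (num_entries_lg2 = 9, isz_lg2 = 12) and last_va = 0x3fffff falling within
 * this table, the result is log2_mod(0x3fffff >> 12, 9) + 1 = 511 + 1 = 512,
 * one past the table's final index.
 */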
static inline void _pt_iter_first(struct pt_state *pts)
{
pts->index = pt_range_to_index(pts);
pts->end_index = pt_range_to_end_index(pts);
PT_WARN_ON(pts->index > pts->end_index);
}
static inline bool _pt_iter_load(struct pt_state *pts)
{
if (pts->index >= pts->end_index)
return false;
pt_load_entry(pts);
return true;
}
/**
* pt_next_entry() - Advance pts to the next entry
* @pts: Iteration State
*
 * Update pts to go to the next index at this level. If pts is pointing at a
 * contiguous entry then the index may advance by more than one.
*/
static inline void pt_next_entry(struct pt_state *pts)
{
	/*
	 * If the format cannot have contiguous entries then
	 * pt_entry_num_contig_lg2() is a compile time constant 0 and the
	 * compiler will remove this branch.
	 */
	if (pts->type == PT_ENTRY_OA &&
	    !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0))
_pt_advance(pts, pt_entry_num_contig_lg2(pts));
else
pts->index++;
pt_index_to_va(pts);
}
/**
* for_each_pt_level_entry() - For loop wrapper over entries in the range
* @pts: Iteration State
*
* This is the basic iteration primitive. It iterates over all the entries in
* pts->range that fall within the pts's current table level. Each step does
* pt_load_entry(pts).
*/
#define for_each_pt_level_entry(pts) \
for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts))
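/*
 * Example (illustrative sketch of a walker level function; count_fn is a
 * hypothetical name and the PT_ENTRY_TABLE enum value is assumed from
 * pt_common.h):
 *
 *	static int count_fn(struct pt_range *range, void *arg,
 *			    unsigned int level, struct pt_table_p *table)
 *	{
 *		struct pt_state pts = pt_init(range, level, table);
 *		u64 *count = arg;
 *
 *		for_each_pt_level_entry(&pts) {
 *			if (pts.type == PT_ENTRY_TABLE) {
 *				int ret = pt_descend(&pts, arg, count_fn);
 *
 *				if (ret)
 *					return ret;
 *			} else if (pts.type == PT_ENTRY_OA) {
 *				(*count)++;
 *			}
 *		}
 *		return 0;
 *	}
 */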
/**
* pt_load_single_entry() - Version of pt_load_entry() usable within a walker
* @pts: Iteration State
*
* Alternative to for_each_pt_level_entry() if the walker function uses only a
* single entry.
*/
static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts)
{
pts->index = pt_range_to_index(pts);
pt_load_entry(pts);
return pts->type;
}
static __always_inline struct pt_range _pt_top_range(struct pt_common *common,
uintptr_t top_of_table)
{
struct pt_range range = {
.common = common,
.top_table =
(struct pt_table_p *)(top_of_table &
~(uintptr_t)PT_TOP_LEVEL_MASK),
.top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS),
};
struct pt_state pts = { .range = &range, .level = range.top_level };
unsigned int max_vasz_lg2;
max_vasz_lg2 = common->max_vasz_lg2;
if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
pts.level != PT_MAX_TOP_LEVEL)
max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2,
pt_num_items_lg2(&pts) +
pt_table_item_lg2sz(&pts));
	/*
	 * With PT_FEAT_SIGN_EXTEND the top range defaults to the lower region
	 * only.
	 */
range.max_vasz_lg2 = max_vasz_lg2;
if (pt_feature(common, PT_FEAT_SIGN_EXTEND))
max_vasz_lg2--;
range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2);
range.last_va =
fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2);
return range;
}
/**
* pt_top_range() - Return a range that spans part of the top level
* @common: Table
*
 * For PT_FEAT_SIGN_EXTEND this returns the lower range, covering half of the
 * total page table. Otherwise it returns the entire page table.
*/
static __always_inline struct pt_range pt_top_range(struct pt_common *common)
{
/*
	 * The top pointer can change without locking. We capture the value and
	 * its level here and are safe to walk it so long as both values are
* captured without tearing.
*/
return _pt_top_range(common, READ_ONCE(common->top_of_table));
}
/**
* pt_all_range() - Return a range that spans the entire page table
* @common: Table
*
 * The returned range spans the whole page table. Due to how
 * PT_FEAT_SIGN_EXTEND is supported, range->va and range->last_va will be
 * incorrect during the iteration and must not be accessed.
*/
static inline struct pt_range pt_all_range(struct pt_common *common)
{
struct pt_range range = pt_top_range(common);
if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
return range;
/*
* Pretend the table is linear from 0 without a sign extension. This
* generates the correct indexes for iteration.
*/
range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2);
return range;
}
/**
* pt_upper_range() - Return a range that spans part of the top level
* @common: Table
*
 * For PT_FEAT_SIGN_EXTEND this returns the upper range, covering half of the
 * total page table. Otherwise it returns the entire page table.
*/
static inline struct pt_range pt_upper_range(struct pt_common *common)
{
struct pt_range range = pt_top_range(common);
if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
return range;
range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1);
range.last_va = PT_VADDR_MAX;
return range;
}
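/*
 * Example (illustrative): with PT_FEAT_SIGN_EXTEND a full traversal can be
 * done as two walks, one per half (fn and arg are hypothetical):
 *
 *	struct pt_range lower = pt_top_range(common);
 *	struct pt_range upper = pt_upper_range(common);
 *	int ret;
 *
 *	ret = pt_walk_range(&lower, fn, arg);
 *	if (!ret)
 *		ret = pt_walk_range(&upper, fn, arg);
 */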
/**
* pt_make_range() - Return a range that spans part of the table
* @common: Table
* @va: Start address
* @last_va: Last address
*
* The caller must validate the range with pt_check_range() before using it.
*/
static __always_inline struct pt_range
pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va)
{
struct pt_range range =
_pt_top_range(common, READ_ONCE(common->top_of_table));
range.va = va;
range.last_va = last_va;
return range;
}
/*
* Span a slice of the table starting at a lower table level from an active
* walk.
*/
static __always_inline struct pt_range
pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va,
pt_vaddr_t last_va)
{
struct pt_range range = *parent;
range.va = va;
range.last_va = last_va;
PT_WARN_ON(last_va < va);
PT_WARN_ON(pt_check_range(&range));
return range;
}
/**
* pt_init() - Initialize a pt_state on the stack
* @range: Range pointer to embed in the state
* @level: Table level for the state
* @table: Pointer to the table memory at level
*
* Helper to initialize the on-stack pt_state from walker arguments.
*/
static __always_inline struct pt_state
pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table)
{
struct pt_state pts = {
.range = range,
.table = table,
.level = level,
};
return pts;
}
/**
* pt_init_top() - Initialize a pt_state on the stack
* @range: Range pointer to embed in the state
*
 * The pt_state points to the topmost level.
*/
static __always_inline struct pt_state pt_init_top(struct pt_range *range)
{
return pt_init(range, range->top_level, range->top_table);
}
typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg,
unsigned int level, struct pt_table_p *table);
/**
* pt_descend() - Recursively invoke the walker for the lower level
* @pts: Iteration State
* @arg: Value to pass to the function
* @fn: Walker function to call
*
* pts must point to a table item. Invoke fn as a walker on the table
* pts points to.
*/
static __always_inline int pt_descend(struct pt_state *pts, void *arg,
pt_level_fn_t fn)
{
int ret;
if (PT_WARN_ON(!pts->table_lower))
return -EINVAL;
ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower);
return ret;
}
/**
* pt_walk_range() - Walk over a VA range
* @range: Range pointer
* @fn: Walker function to call
* @arg: Value to pass to the function
*
* Walk over a VA range. The caller should have done a validity check, at
* least calling pt_check_range(), when building range. The walk will
 * start at the topmost table.
*/
static __always_inline int pt_walk_range(struct pt_range *range,
pt_level_fn_t fn, void *arg)
{
return fn(range, arg, range->top_level, range->top_table);
}
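/*
 * Example (illustrative, reusing the hypothetical count_fn sketch above):
 *
 *	struct pt_range range = pt_make_range(common, va, last_va);
 *	u64 count = 0;
 *	int ret;
 *
 *	ret = pt_check_range(&range);
 *	if (ret)
 *		return ret;
 *	ret = pt_walk_range(&range, count_fn, &count);
 */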
/**
 * pt_walk_descend() - Recursively invoke the walker for a slice of a lower
 *                     level
 * @pts: Iteration State
 * @va: Start address
 * @last_va: Last address
 * @fn: Walker function to call
 * @arg: Value to pass to the function
 *
 * With pts pointing at a table item this will descend and walk over a slice of
 * the lower table. The caller must ensure that va/last_va are within the table
 * item. This creates a new walk and does not alter pts or pts->range.
 */
static __always_inline int pt_walk_descend(const struct pt_state *pts,
pt_vaddr_t va, pt_vaddr_t last_va,
pt_level_fn_t fn, void *arg)
{
struct pt_range range = pt_make_child_range(pts->range, va, last_va);
if (PT_WARN_ON(!pt_can_have_table(pts)) ||
PT_WARN_ON(!pts->table_lower))
return -EINVAL;
return fn(&range, arg, pts->level - 1, pts->table_lower);
}
/**
 * pt_walk_descend_all() - Recursively invoke the walker for a table item
 * @parent_pts: Iteration State
 * @fn: Walker function to call
 * @arg: Value to pass to the function
 *
 * With parent_pts pointing at a table item this will descend and walk over the
 * entire lower table. This creates a new walk and does not alter parent_pts or
 * parent_pts->range.
 */
static __always_inline int
pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
void *arg)
{
unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts);
return pt_walk_descend(parent_pts,
log2_set_mod(parent_pts->range->va, 0, isz_lg2),
log2_set_mod_max(parent_pts->range->va, isz_lg2),
fn, arg);
}
/**
* pt_range_slice() - Return a range that spans indexes
* @pts: Iteration State
* @start_index: Starting index within pts
* @end_index: Ending index within pts
*
 * Create a range that spans an index range of the table level the pt_state
 * currently points at.
*/
static inline struct pt_range pt_range_slice(const struct pt_state *pts,
unsigned int start_index,
unsigned int end_index)
{
unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
pt_vaddr_t last_va;
pt_vaddr_t va;
va = fvalog2_set_mod(pts->range->va,
log2_mul(start_index, pt_table_item_lg2sz(pts)),
table_lg2sz);
last_va = fvalog2_set_mod(
pts->range->va,
log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz);
return pt_make_child_range(pts->range, va, last_va);
}
/**
 * pt_top_memsize_lg2() - Return the allocation size of the top table in lg2
* @common: Table
* @top_of_table: Top of table value from _pt_top_set()
*
* Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this
* will compute the top size assuming the table will grow.
*/
static inline unsigned int pt_top_memsize_lg2(struct pt_common *common,
uintptr_t top_of_table)
{
struct pt_range range = _pt_top_range(common, top_of_table);
struct pt_state pts = pt_init_top(&range);
unsigned int num_items_lg2;
num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts);
if (range.top_level != PT_MAX_TOP_LEVEL &&
pt_feature(common, PT_FEAT_DYNAMIC_TOP))
num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts));
/* Round up the allocation size to the minimum alignment */
return max(ffs_t(u64, PT_TOP_PHYS_MASK),
num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE));
}
/**
* pt_compute_best_pgsize() - Determine the best page size for leaf entries
* @pgsz_bitmap: Permitted page sizes
* @va: Starting virtual address for the leaf entry
* @last_va: Last virtual address for the leaf entry, sets the max page size
* @oa: Starting output address for the leaf entry
*
 * Compute the largest page size for va, last_va, and oa together. The largest
 * page size depends on the format's supported page sizes at this level, and
 * the relative alignment of the VA and OA addresses.
 *
 * Return: the best page size in lg2, or 0 if the OA cannot be stored with the
 * provided pgsz_bitmap.
 */
static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
pt_vaddr_t va,
pt_vaddr_t last_va,
pt_oaddr_t oa)
{
unsigned int best_pgsz_lg2;
unsigned int pgsz_lg2;
pt_vaddr_t len = last_va - va + 1;
pt_vaddr_t mask;
if (PT_WARN_ON(va >= last_va))
return 0;
/*
* Given a VA/OA pair the best page size is the largest page size
* where:
*
* 1) VA and OA start at the page. Bitwise this is the count of least
* significant 0 bits.
* This also implies that last_va/oa has the same prefix as va/oa.
*/
mask = va | oa;
/*
* 2) The page size is not larger than the last_va (length). Since page
* sizes are always power of two this can't be larger than the
* largest power of two factor of the length.
*/
mask |= log2_to_int(vafls(len) - 1);
	/* The count of least significant 0 bits is the best page size in lg2 */
	best_pgsz_lg2 = vaffs(mask) - 1;
/* Choose the highest bit <= best_pgsz_lg2 */
if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1)
pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1);
pgsz_lg2 = vafls(pgsz_bitmap);
if (!pgsz_lg2)
return 0;
pgsz_lg2--;
PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0);
PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0);
PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va);
PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
PT_WARN_ON(
!oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
return pgsz_lg2;
}
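/*
 * Worked example (illustrative, using SZ_* from linux/sizes.h): for
 * va = 0x200000, last_va = 0x5fffff (len = 0x400000), oa = 0x5200000 and
 * pgsz_bitmap = SZ_4K | SZ_2M: mask = 0x5200000 | 0x400000 = 0x5600000, whose
 * lowest set bit gives best_pgsz_lg2 = 21. Both bitmap bits survive the
 * truncation and the highest, 2M, wins, so 21 is returned.
 */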
#define _PT_MAKE_CALL_LEVEL(fn) \
static __always_inline int fn(struct pt_range *range, void *arg, \
unsigned int level, \
struct pt_table_p *table) \
{ \
static_assert(PT_MAX_TOP_LEVEL <= 5); \
if (level == 0) \
return CONCATENATE(fn, 0)(range, arg, 0, table); \
if (level == 1 || PT_MAX_TOP_LEVEL == 1) \
return CONCATENATE(fn, 1)(range, arg, 1, table); \
if (level == 2 || PT_MAX_TOP_LEVEL == 2) \
return CONCATENATE(fn, 2)(range, arg, 2, table); \
if (level == 3 || PT_MAX_TOP_LEVEL == 3) \
return CONCATENATE(fn, 3)(range, arg, 3, table); \
if (level == 4 || PT_MAX_TOP_LEVEL == 4) \
return CONCATENATE(fn, 4)(range, arg, 4, table); \
return CONCATENATE(fn, 5)(range, arg, 5, table); \
}
static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg,
unsigned int unused_level,
struct pt_table_p *table)
{
static_assert(PT_MAX_TOP_LEVEL <= 5);
return -EPROTOTYPE;
}
#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \
static inline int fn(struct pt_range *range, void *arg, \
unsigned int unused_level, \
struct pt_table_p *table) \
{ \
return do_fn(range, arg, level, table, descend_fn); \
}
/**
* PT_MAKE_LEVELS() - Build an unwound walker
* @fn: Name of the walker function
* @do_fn: Function to call at each level
*
* This builds a function call tree that can be fully inlined.
* The caller must provide a function body in an __always_inline function::
*
* static __always_inline int do_fn(struct pt_range *range, void *arg,
* unsigned int level, struct pt_table_p *table,
* pt_level_fn_t descend_fn)
*
* An inline function will be created for each table level that calls do_fn with
* a compile time constant for level and a pointer to the next lower function.
* This generates an optimally inlined walk where each of the functions sees a
* constant level and can codegen the exact constants/etc for that level.
*
* Note this can produce a lot of code!
*/
#define PT_MAKE_LEVELS(fn, do_fn) \
__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \
do_fn); \
__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \
__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \
__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \
__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \
__PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \
_PT_MAKE_CALL_LEVEL(fn)
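/*
 * Example (illustrative sketch; my_walk and do_my_walk are hypothetical
 * names):
 *
 *	static __always_inline int do_my_walk(struct pt_range *range,
 *					      void *arg, unsigned int level,
 *					      struct pt_table_p *table,
 *					      pt_level_fn_t descend_fn)
 *	{
 *		struct pt_state pts = pt_init(range, level, table);
 *
 *		for_each_pt_level_entry(&pts) {
 *			if (pts.type == PT_ENTRY_TABLE) {
 *				int ret = pt_descend(&pts, arg, descend_fn);
 *
 *				if (ret)
 *					return ret;
 *			}
 *		}
 *		return 0;
 *	}
 *	PT_MAKE_LEVELS(my_walk, do_my_walk);
 *
 * The generated my_walk() can then be passed to pt_walk_range(), and each
 * level's instantiation sees a compile time constant level.
 */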
#endif