kernel: Update MGLRU patchset
[openwrt/openwrt.git] / target / linux / generic / backport-5.15 / 020-v6.1-08-mm-multi-gen-LRU-support-page-table-walks.patch
1 From 05223c4e80b34e29f2255c04ffebc2c4475e7593 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Sun, 18 Sep 2022 02:00:05 -0600
4 Subject: [PATCH 08/29] mm: multi-gen LRU: support page table walks
5 MIME-Version: 1.0
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
8
9 To further exploit spatial locality, the aging prefers to walk page tables
10 to search for young PTEs and promote hot pages. A kill switch will be
11 added in the next patch to disable this behavior. When disabled, the
12 aging relies on the rmap only.
13
14 NB: this behavior has nothing in common with the page table scanning in the
15 2.4 kernel [1], which searches page tables for old PTEs, adds cold pages
16 to swapcache and unmaps them.
17
18 To avoid confusion, the term "iteration" specifically means the traversal
19 of an entire mm_struct list; the term "walk" will be applied to page
20 tables and the rmap, as usual.
21
22 An mm_struct list is maintained for each memcg, and an mm_struct follows
23 its owner task to the new memcg when this task is migrated. Given an
24 lruvec, the aging iterates lruvec_memcg()->mm_list and calls
25 walk_page_range() with each mm_struct on this list to promote hot pages
26 before it increments max_seq.
27
28 When multiple page table walkers iterate the same list, each of them gets
29 a unique mm_struct; therefore they can run concurrently. Page table
30 walkers ignore any misplaced pages, e.g., if an mm_struct was migrated,
31 pages it left in the previous memcg will not be promoted when its current
32 memcg is under reclaim. Similarly, page table walkers will not promote
33 pages from nodes other than the one under reclaim.
34
35 This patch uses the following optimizations when walking page tables:
36 1. It tracks the usage of mm_struct's between context switches so that
37 page table walkers can skip processes that have been sleeping since
38 the last iteration.
39 2. It uses generational Bloom filters to record populated branches so
40 that page table walkers can reduce their search space based on the
41 query results, e.g., to skip page tables containing mostly holes or
42 misplaced pages.
43 3. It takes advantage of the accessed bit in non-leaf PMD entries when
44 CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
45 4. It does not zigzag between a PGD table and the same PMD table
46 spanning multiple VMAs. IOW, it finishes all the VMAs within the
47 range of the same PMD table before it returns to a PGD table. This
48 improves the cache performance for workloads that have large
49 numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.
50
51 Server benchmark results:
52 Single workload:
53 fio (buffered I/O): no change
54
55 Single workload:
56 memcached (anon): +[8, 10]%
57             Ops/sec      KB/sec
58   patch1-7: 1147696.57   44640.29
59   patch1-8: 1245274.91   48435.66
60
61 Configurations:
62 no change
63
64 Client benchmark results:
65 kswapd profiles:
66 patch1-7
67 48.16% lzo1x_1_do_compress (real work)
68 8.20% page_vma_mapped_walk (overhead)
69 7.06% _raw_spin_unlock_irq
70 2.92% ptep_clear_flush
71 2.53% __zram_bvec_write
72 2.11% do_raw_spin_lock
73 2.02% memmove
74 1.93% lru_gen_look_around
75 1.56% free_unref_page_list
76 1.40% memset
77
78 patch1-8
79 49.44% lzo1x_1_do_compress (real work)
80 6.19% page_vma_mapped_walk (overhead)
81 5.97% _raw_spin_unlock_irq
82 3.13% get_pfn_page
83 2.85% ptep_clear_flush
84 2.42% __zram_bvec_write
85 2.08% do_raw_spin_lock
86 1.92% memmove
87 1.44% alloc_zspage
88 1.36% memset
89
90 Configurations:
91 no change
92
93 Thanks to the following developers for their efforts [3].
94 kernel test robot <lkp@intel.com>
95
96 [1] https://lwn.net/Articles/23732/
97 [2] https://llvm.org/docs/ScudoHardenedAllocator.html
98 [3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/
99
100 Link: https://lkml.kernel.org/r/20220918080010.2920238-9-yuzhao@google.com
101 Signed-off-by: Yu Zhao <yuzhao@google.com>
102 Acked-by: Brian Geffon <bgeffon@google.com>
103 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
104 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
105 Acked-by: Steven Barrett <steven@liquorix.net>
106 Acked-by: Suleiman Souhlal <suleiman@google.com>
107 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
108 Tested-by: Donald Carr <d@chaos-reins.com>
109 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
110 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
111 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
112 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
113 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
114 Cc: Andi Kleen <ak@linux.intel.com>
115 Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
116 Cc: Barry Song <baohua@kernel.org>
117 Cc: Catalin Marinas <catalin.marinas@arm.com>
118 Cc: Dave Hansen <dave.hansen@linux.intel.com>
119 Cc: Hillf Danton <hdanton@sina.com>
120 Cc: Jens Axboe <axboe@kernel.dk>
121 Cc: Johannes Weiner <hannes@cmpxchg.org>
122 Cc: Jonathan Corbet <corbet@lwn.net>
123 Cc: Linus Torvalds <torvalds@linux-foundation.org>
124 Cc: Matthew Wilcox <willy@infradead.org>
125 Cc: Mel Gorman <mgorman@suse.de>
126 Cc: Miaohe Lin <linmiaohe@huawei.com>
127 Cc: Michael Larabel <Michael@MichaelLarabel.com>
128 Cc: Michal Hocko <mhocko@kernel.org>
129 Cc: Mike Rapoport <rppt@kernel.org>
130 Cc: Mike Rapoport <rppt@linux.ibm.com>
131 Cc: Peter Zijlstra <peterz@infradead.org>
132 Cc: Qi Zheng <zhengqi.arch@bytedance.com>
133 Cc: Tejun Heo <tj@kernel.org>
134 Cc: Vlastimil Babka <vbabka@suse.cz>
135 Cc: Will Deacon <will@kernel.org>
136 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
137 ---
138 fs/exec.c | 2 +
139 include/linux/memcontrol.h | 5 +
140 include/linux/mm_types.h | 76 +++
141 include/linux/mmzone.h | 56 +-
142 include/linux/swap.h | 4 +
143 kernel/exit.c | 1 +
144 kernel/fork.c | 9 +
145 kernel/sched/core.c | 1 +
146 mm/memcontrol.c | 25 +
147 mm/vmscan.c | 1010 +++++++++++++++++++++++++++++++++++-
148 10 files changed, 1172 insertions(+), 17 deletions(-)
149
150 diff --git a/fs/exec.c b/fs/exec.c
151 index 881390b44cfd..1afa15a07d26 100644
152 --- a/fs/exec.c
153 +++ b/fs/exec.c
154 @@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *mm)
155 active_mm = tsk->active_mm;
156 tsk->active_mm = mm;
157 tsk->mm = mm;
158 + lru_gen_add_mm(mm);
159 /*
160 * This prevents preemption while active_mm is being loaded and
161 * it and mm are being updated, which could cause problems for
162 @@ -1028,6 +1029,7 @@ static int exec_mmap(struct mm_struct *mm)
163 tsk->mm->vmacache_seqnum = 0;
164 vmacache_flush(tsk);
165 task_unlock(tsk);
166 + lru_gen_use_mm(mm);
167 if (old_mm) {
168 mmap_read_unlock(old_mm);
169 BUG_ON(active_mm != old_mm);
170 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
171 index 8d6a0329bc59..3736405cbcf6 100644
172 --- a/include/linux/memcontrol.h
173 +++ b/include/linux/memcontrol.h
174 @@ -348,6 +348,11 @@ struct mem_cgroup {
175 struct deferred_split deferred_split_queue;
176 #endif
177
178 +#ifdef CONFIG_LRU_GEN
179 + /* per-memcg mm_struct list */
180 + struct lru_gen_mm_list mm_list;
181 +#endif
182 +
183 struct mem_cgroup_per_node *nodeinfo[];
184 };
185
186 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
187 index 7f8ee09c711f..33c142d31261 100644
188 --- a/include/linux/mm_types.h
189 +++ b/include/linux/mm_types.h
190 @@ -580,6 +580,22 @@ struct mm_struct {
191 #ifdef CONFIG_IOMMU_SUPPORT
192 u32 pasid;
193 #endif
194 +#ifdef CONFIG_LRU_GEN
195 + struct {
196 + /* this mm_struct is on lru_gen_mm_list */
197 + struct list_head list;
198 + /*
199 + * Set when switching to this mm_struct, as a hint of
200 + * whether it has been used since the last time per-node
201 + * page table walkers cleared the corresponding bits.
202 + */
203 + unsigned long bitmap;
204 +#ifdef CONFIG_MEMCG
205 + /* points to the memcg of "owner" above */
206 + struct mem_cgroup *memcg;
207 +#endif
208 + } lru_gen;
209 +#endif /* CONFIG_LRU_GEN */
210 } __randomize_layout;
211
212 /*
213 @@ -606,6 +622,66 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
214 return (struct cpumask *)&mm->cpu_bitmap;
215 }
216
217 +#ifdef CONFIG_LRU_GEN
218 +
219 +struct lru_gen_mm_list {
220 + /* mm_struct list for page table walkers */
221 + struct list_head fifo;
222 + /* protects the list above */
223 + spinlock_t lock;
224 +};
225 +
226 +void lru_gen_add_mm(struct mm_struct *mm);
227 +void lru_gen_del_mm(struct mm_struct *mm);
228 +#ifdef CONFIG_MEMCG
229 +void lru_gen_migrate_mm(struct mm_struct *mm);
230 +#endif
231 +
232 +static inline void lru_gen_init_mm(struct mm_struct *mm)
233 +{
234 + INIT_LIST_HEAD(&mm->lru_gen.list);
235 + mm->lru_gen.bitmap = 0;
236 +#ifdef CONFIG_MEMCG
237 + mm->lru_gen.memcg = NULL;
238 +#endif
239 +}
240 +
241 +static inline void lru_gen_use_mm(struct mm_struct *mm)
242 +{
243 + /*
244 + * When the bitmap is set, page reclaim knows this mm_struct has been
245 + * used since the last time it cleared the bitmap. So it might be worth
246 + * walking the page tables of this mm_struct to clear the accessed bit.
247 + */
248 + WRITE_ONCE(mm->lru_gen.bitmap, -1);
249 +}
250 +
251 +#else /* !CONFIG_LRU_GEN */
252 +
253 +static inline void lru_gen_add_mm(struct mm_struct *mm)
254 +{
255 +}
256 +
257 +static inline void lru_gen_del_mm(struct mm_struct *mm)
258 +{
259 +}
260 +
261 +#ifdef CONFIG_MEMCG
262 +static inline void lru_gen_migrate_mm(struct mm_struct *mm)
263 +{
264 +}
265 +#endif
266 +
267 +static inline void lru_gen_init_mm(struct mm_struct *mm)
268 +{
269 +}
270 +
271 +static inline void lru_gen_use_mm(struct mm_struct *mm)
272 +{
273 +}
274 +
275 +#endif /* CONFIG_LRU_GEN */
276 +
277 struct mmu_gather;
278 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
279 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
280 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
281 index 4db2b877fcf9..659bab633bdf 100644
282 --- a/include/linux/mmzone.h
283 +++ b/include/linux/mmzone.h
284 @@ -385,7 +385,7 @@ enum {
285 * min_seq behind.
286 *
287 * The number of pages in each generation is eventually consistent and therefore
288 - * can be transiently negative.
289 + * can be transiently negative when reset_batch_size() is pending.
290 */
291 struct lru_gen_struct {
292 /* the aging increments the youngest generation number */
293 @@ -407,6 +407,53 @@ struct lru_gen_struct {
294 atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
295 };
296
297 +enum {
298 + MM_LEAF_TOTAL, /* total leaf entries */
299 + MM_LEAF_OLD, /* old leaf entries */
300 + MM_LEAF_YOUNG, /* young leaf entries */
301 + MM_NONLEAF_TOTAL, /* total non-leaf entries */
302 + MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
303 + MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
304 + NR_MM_STATS
305 +};
306 +
307 +/* double-buffering Bloom filters */
308 +#define NR_BLOOM_FILTERS 2
309 +
310 +struct lru_gen_mm_state {
311 + /* set to max_seq after each iteration */
312 + unsigned long seq;
313 + /* where the current iteration continues (inclusive) */
314 + struct list_head *head;
315 + /* where the last iteration ended (exclusive) */
316 + struct list_head *tail;
317 + /* to wait for the last page table walker to finish */
318 + struct wait_queue_head wait;
319 + /* Bloom filters flip after each iteration */
320 + unsigned long *filters[NR_BLOOM_FILTERS];
321 + /* the mm stats for debugging */
322 + unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
323 + /* the number of concurrent page table walkers */
324 + int nr_walkers;
325 +};
326 +
327 +struct lru_gen_mm_walk {
328 + /* the lruvec under reclaim */
329 + struct lruvec *lruvec;
330 + /* unstable max_seq from lru_gen_struct */
331 + unsigned long max_seq;
332 + /* the next address within an mm to scan */
333 + unsigned long next_addr;
334 + /* to batch promoted pages */
335 + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
336 + /* to batch the mm stats */
337 + int mm_stats[NR_MM_STATS];
338 + /* total batched items */
339 + int batched;
340 + bool can_swap;
341 + bool force_scan;
342 +};
343 +
344 void lru_gen_init_lruvec(struct lruvec *lruvec);
345 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
346
347 @@ -457,6 +504,8 @@ struct lruvec {
348 #ifdef CONFIG_LRU_GEN
349 /* evictable pages divided into generations */
350 struct lru_gen_struct lrugen;
351 + /* to concurrently iterate lru_gen_mm_list */
352 + struct lru_gen_mm_state mm_state;
353 #endif
354 #ifdef CONFIG_MEMCG
355 struct pglist_data *pgdat;
356 @@ -1042,6 +1091,11 @@ typedef struct pglist_data {
357
358 unsigned long flags;
359
360 +#ifdef CONFIG_LRU_GEN
361 + /* kswapd mm walk data */
362 + struct lru_gen_mm_walk mm_walk;
363 +#endif
364 +
365 ZONE_PADDING(_pad2_)
366
367 /* Per-node vmstats */
368 diff --git a/include/linux/swap.h b/include/linux/swap.h
369 index 4efd267e2937..e970fca4f178 100644
370 --- a/include/linux/swap.h
371 +++ b/include/linux/swap.h
372 @@ -137,6 +137,10 @@ union swap_header {
373 */
374 struct reclaim_state {
375 unsigned long reclaimed_slab;
376 +#ifdef CONFIG_LRU_GEN
377 + /* per-thread mm walk data */
378 + struct lru_gen_mm_walk *mm_walk;
379 +#endif
380 };
381
382 #ifdef __KERNEL__
383 diff --git a/kernel/exit.c b/kernel/exit.c
384 index 80efdfda6662..06b477395012 100644
385 --- a/kernel/exit.c
386 +++ b/kernel/exit.c
387 @@ -469,6 +469,7 @@ void mm_update_next_owner(struct mm_struct *mm)
388 goto retry;
389 }
390 WRITE_ONCE(mm->owner, c);
391 + lru_gen_migrate_mm(mm);
392 task_unlock(c);
393 put_task_struct(c);
394 }
395 diff --git a/kernel/fork.c b/kernel/fork.c
396 index 68eab6ce3085..d8f37ecdde87 100644
397 --- a/kernel/fork.c
398 +++ b/kernel/fork.c
399 @@ -1083,6 +1083,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
400 goto fail_nocontext;
401
402 mm->user_ns = get_user_ns(user_ns);
403 + lru_gen_init_mm(mm);
404 return mm;
405
406 fail_nocontext:
407 @@ -1125,6 +1126,7 @@ static inline void __mmput(struct mm_struct *mm)
408 }
409 if (mm->binfmt)
410 module_put(mm->binfmt->module);
411 + lru_gen_del_mm(mm);
412 mmdrop(mm);
413 }
414
415 @@ -2622,6 +2624,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
416 get_task_struct(p);
417 }
418
419 + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
420 + /* lock the task to synchronize with memcg migration */
421 + task_lock(p);
422 + lru_gen_add_mm(p->mm);
423 + task_unlock(p);
424 + }
425 +
426 wake_up_new_task(p);
427
428 /* forking complete and child started to run, tell ptracer */
429 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
430 index c1458fa8beb3..fe4d60474d4a 100644
431 --- a/kernel/sched/core.c
432 +++ b/kernel/sched/core.c
433 @@ -5007,6 +5007,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
434 * finish_task_switch()'s mmdrop().
435 */
436 switch_mm_irqs_off(prev->active_mm, next->mm, next);
437 + lru_gen_use_mm(next->mm);
438
439 if (!prev->mm) { // from kernel
440 /* will mmdrop() in finish_task_switch(). */
441 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
442 index cc3431c5d9ba..ed87d1256f0e 100644
443 --- a/mm/memcontrol.c
444 +++ b/mm/memcontrol.c
445 @@ -6212,6 +6212,30 @@ static void mem_cgroup_move_task(void)
446 }
447 #endif
448
449 +#ifdef CONFIG_LRU_GEN
450 +static void mem_cgroup_attach(struct cgroup_taskset *tset)
451 +{
452 + struct task_struct *task;
453 + struct cgroup_subsys_state *css;
454 +
455 + /* find the first leader if there is any */
456 + cgroup_taskset_for_each_leader(task, css, tset)
457 + break;
458 +
459 + if (!task)
460 + return;
461 +
462 + task_lock(task);
463 + if (task->mm && READ_ONCE(task->mm->owner) == task)
464 + lru_gen_migrate_mm(task->mm);
465 + task_unlock(task);
466 +}
467 +#else
468 +static void mem_cgroup_attach(struct cgroup_taskset *tset)
469 +{
470 +}
471 +#endif /* CONFIG_LRU_GEN */
472 +
473 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
474 {
475 if (value == PAGE_COUNTER_MAX)
476 @@ -6555,6 +6579,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
477 .css_reset = mem_cgroup_css_reset,
478 .css_rstat_flush = mem_cgroup_css_rstat_flush,
479 .can_attach = mem_cgroup_can_attach,
480 + .attach = mem_cgroup_attach,
481 .cancel_attach = mem_cgroup_cancel_attach,
482 .post_attach = mem_cgroup_move_task,
483 .dfl_cftypes = memory_files,
484 diff --git a/mm/vmscan.c b/mm/vmscan.c
485 index 1d0b25ae378c..a7844c689522 100644
486 --- a/mm/vmscan.c
487 +++ b/mm/vmscan.c
488 @@ -50,6 +50,8 @@
489 #include <linux/printk.h>
490 #include <linux/dax.h>
491 #include <linux/psi.h>
492 +#include <linux/pagewalk.h>
493 +#include <linux/shmem_fs.h>
494
495 #include <asm/tlbflush.h>
496 #include <asm/div64.h>
497 @@ -2853,7 +2855,7 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
498 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
499 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
500
501 -static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
502 +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
503 {
504 struct pglist_data *pgdat = NODE_DATA(nid);
505
506 @@ -2898,6 +2900,371 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
507 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
508 }
509
510 +/******************************************************************************
511 + * mm_struct list
512 + ******************************************************************************/
513 +
514 +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
515 +{
516 + static struct lru_gen_mm_list mm_list = {
517 + .fifo = LIST_HEAD_INIT(mm_list.fifo),
518 + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
519 + };
520 +
521 +#ifdef CONFIG_MEMCG
522 + if (memcg)
523 + return &memcg->mm_list;
524 +#endif
525 + VM_WARN_ON_ONCE(!mem_cgroup_disabled());
526 +
527 + return &mm_list;
528 +}
529 +
530 +void lru_gen_add_mm(struct mm_struct *mm)
531 +{
532 + int nid;
533 + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
534 + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
535 +
536 + VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
537 +#ifdef CONFIG_MEMCG
538 + VM_WARN_ON_ONCE(mm->lru_gen.memcg);
539 + mm->lru_gen.memcg = memcg;
540 +#endif
541 + spin_lock(&mm_list->lock);
542 +
543 + for_each_node_state(nid, N_MEMORY) {
544 + struct lruvec *lruvec = get_lruvec(memcg, nid);
545 +
546 + if (!lruvec)
547 + continue;
548 +
549 + /* the first addition since the last iteration */
550 + if (lruvec->mm_state.tail == &mm_list->fifo)
551 + lruvec->mm_state.tail = &mm->lru_gen.list;
552 + }
553 +
554 + list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
555 +
556 + spin_unlock(&mm_list->lock);
557 +}
558 +
559 +void lru_gen_del_mm(struct mm_struct *mm)
560 +{
561 + int nid;
562 + struct lru_gen_mm_list *mm_list;
563 + struct mem_cgroup *memcg = NULL;
564 +
565 + if (list_empty(&mm->lru_gen.list))
566 + return;
567 +
568 +#ifdef CONFIG_MEMCG
569 + memcg = mm->lru_gen.memcg;
570 +#endif
571 + mm_list = get_mm_list(memcg);
572 +
573 + spin_lock(&mm_list->lock);
574 +
575 + for_each_node(nid) {
576 + struct lruvec *lruvec = get_lruvec(memcg, nid);
577 +
578 + if (!lruvec)
579 + continue;
580 +
581 + /* where the last iteration ended (exclusive) */
582 + if (lruvec->mm_state.tail == &mm->lru_gen.list)
583 + lruvec->mm_state.tail = lruvec->mm_state.tail->next;
584 +
585 + /* where the current iteration continues (inclusive) */
586 + if (lruvec->mm_state.head != &mm->lru_gen.list)
587 + continue;
588 +
589 + lruvec->mm_state.head = lruvec->mm_state.head->next;
590 + /* the deletion ends the current iteration */
591 + if (lruvec->mm_state.head == &mm_list->fifo)
592 + WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
593 + }
594 +
595 + list_del_init(&mm->lru_gen.list);
596 +
597 + spin_unlock(&mm_list->lock);
598 +
599 +#ifdef CONFIG_MEMCG
600 + mem_cgroup_put(mm->lru_gen.memcg);
601 + mm->lru_gen.memcg = NULL;
602 +#endif
603 +}
604 +
605 +#ifdef CONFIG_MEMCG
606 +void lru_gen_migrate_mm(struct mm_struct *mm)
607 +{
608 + struct mem_cgroup *memcg;
609 + struct task_struct *task = rcu_dereference_protected(mm->owner, true);
610 +
611 + VM_WARN_ON_ONCE(task->mm != mm);
612 + lockdep_assert_held(&task->alloc_lock);
613 +
614 + /* for mm_update_next_owner() */
615 + if (mem_cgroup_disabled())
616 + return;
617 +
618 + rcu_read_lock();
619 + memcg = mem_cgroup_from_task(task);
620 + rcu_read_unlock();
621 + if (memcg == mm->lru_gen.memcg)
622 + return;
623 +
624 + VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
625 + VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
626 +
627 + lru_gen_del_mm(mm);
628 + lru_gen_add_mm(mm);
629 +}
630 +#endif
631 +
632 +/*
633 + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
634 + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
635 + * bits in a bitmap, k is the number of hash functions and n is the number of
636 + * inserted items.
637 + *
638 + * Page table walkers use one of the two filters to reduce their search space.
639 + * To get rid of non-leaf entries that no longer have enough leaf entries, the
640 + * aging uses the double-buffering technique to flip to the other filter each
641 + * time it produces a new generation. For non-leaf entries that have enough
642 + * leaf entries, the aging carries them over to the next generation in
643 + * walk_pmd_range(); the eviction also reports them when walking the rmap
644 + * in lru_gen_look_around().
645 + *
646 + * For future optimizations:
647 + * 1. It's not necessary to keep both filters all the time. The spare one can be
648 + * freed after the RCU grace period and reallocated if needed again.
649 + * 2. And when reallocating, it's worth scaling its size according to the number
650 + * of inserted entries in the other filter, to reduce the memory overhead on
651 + * small systems and false positives on large systems.
652 + * 3. Jenkins' hash function is an alternative to Knuth's.
653 + */
654 +#define BLOOM_FILTER_SHIFT 15
655 +
656 +static inline int filter_gen_from_seq(unsigned long seq)
657 +{
658 + return seq % NR_BLOOM_FILTERS;
659 +}
660 +
661 +static void get_item_key(void *item, int *key)
662 +{
663 + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
664 +
665 + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
666 +
667 + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
668 + key[1] = hash >> BLOOM_FILTER_SHIFT;
669 +}
670 +
671 +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
672 +{
673 + unsigned long *filter;
674 + int gen = filter_gen_from_seq(seq);
675 +
676 + filter = lruvec->mm_state.filters[gen];
677 + if (filter) {
678 + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
679 + return;
680 + }
681 +
682 + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
683 + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
684 + WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
685 +}
686 +
687 +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
688 +{
689 + int key[2];
690 + unsigned long *filter;
691 + int gen = filter_gen_from_seq(seq);
692 +
693 + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
694 + if (!filter)
695 + return;
696 +
697 + get_item_key(item, key);
698 +
699 + if (!test_bit(key[0], filter))
700 + set_bit(key[0], filter);
701 + if (!test_bit(key[1], filter))
702 + set_bit(key[1], filter);
703 +}
704 +
705 +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
706 +{
707 + int key[2];
708 + unsigned long *filter;
709 + int gen = filter_gen_from_seq(seq);
710 +
711 + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
712 + if (!filter)
713 + return true;
714 +
715 + get_item_key(item, key);
716 +
717 + return test_bit(key[0], filter) && test_bit(key[1], filter);
718 +}
719 +
720 +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
721 +{
722 + int i;
723 + int hist;
724 +
725 + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
726 +
727 + if (walk) {
728 + hist = lru_hist_from_seq(walk->max_seq);
729 +
730 + for (i = 0; i < NR_MM_STATS; i++) {
731 + WRITE_ONCE(lruvec->mm_state.stats[hist][i],
732 + lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
733 + walk->mm_stats[i] = 0;
734 + }
735 + }
736 +
737 + if (NR_HIST_GENS > 1 && last) {
738 + hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
739 +
740 + for (i = 0; i < NR_MM_STATS; i++)
741 + WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
742 + }
743 +}
744 +
745 +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
746 +{
747 + int type;
748 + unsigned long size = 0;
749 + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
750 + int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
751 +
752 + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
753 + return true;
754 +
755 + clear_bit(key, &mm->lru_gen.bitmap);
756 +
757 + for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
758 + size += type ? get_mm_counter(mm, MM_FILEPAGES) :
759 + get_mm_counter(mm, MM_ANONPAGES) +
760 + get_mm_counter(mm, MM_SHMEMPAGES);
761 + }
762 +
763 + if (size < MIN_LRU_BATCH)
764 + return true;
765 +
766 + return !mmget_not_zero(mm);
767 +}
768 +
769 +static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
770 + struct mm_struct **iter)
771 +{
772 + bool first = false;
773 + bool last = true;
774 + struct mm_struct *mm = NULL;
775 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
776 + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
777 + struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
778 +
779 + /*
780 + * There are four interesting cases for this page table walker:
781 + * 1. It tries to start a new iteration of mm_list with a stale max_seq;
782 + * there is nothing left to do.
783 + * 2. It's the first of the current generation, and it needs to reset
784 + * the Bloom filter for the next generation.
785 + * 3. It reaches the end of mm_list, and it needs to increment
786 + * mm_state->seq; the iteration is done.
787 + * 4. It's the last of the current generation, and it needs to reset the
788 + * mm stats counters for the next generation.
789 + */
790 + spin_lock(&mm_list->lock);
791 +
792 + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
793 + VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
794 + VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
795 +
796 + if (walk->max_seq <= mm_state->seq) {
797 + if (!*iter)
798 + last = false;
799 + goto done;
800 + }
801 +
802 + if (!mm_state->nr_walkers) {
803 + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
804 +
805 + mm_state->head = mm_list->fifo.next;
806 + first = true;
807 + }
808 +
809 + while (!mm && mm_state->head != &mm_list->fifo) {
810 + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
811 +
812 + mm_state->head = mm_state->head->next;
813 +
814 + /* force scan for those added after the last iteration */
815 + if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
816 + mm_state->tail = mm_state->head;
817 + walk->force_scan = true;
818 + }
819 +
820 + if (should_skip_mm(mm, walk))
821 + mm = NULL;
822 + }
823 +
824 + if (mm_state->head == &mm_list->fifo)
825 + WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
826 +done:
827 + if (*iter && !mm)
828 + mm_state->nr_walkers--;
829 + if (!*iter && mm)
830 + mm_state->nr_walkers++;
831 +
832 + if (mm_state->nr_walkers)
833 + last = false;
834 +
835 + if (*iter || last)
836 + reset_mm_stats(lruvec, walk, last);
837 +
838 + spin_unlock(&mm_list->lock);
839 +
840 + if (mm && first)
841 + reset_bloom_filter(lruvec, walk->max_seq + 1);
842 +
843 + if (*iter)
844 + mmput_async(*iter);
845 +
846 + *iter = mm;
847 +
848 + return last;
849 +}
850 +
851 +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
852 +{
853 + bool success = false;
854 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
855 + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
856 + struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
857 +
858 + spin_lock(&mm_list->lock);
859 +
860 + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
861 +
862 + if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
863 + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
864 +
865 + WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
866 + reset_mm_stats(lruvec, NULL, true);
867 + success = true;
868 + }
869 +
870 + spin_unlock(&mm_list->lock);
871 +
872 + return success;
873 +}
874 +
875 /******************************************************************************
876 * refault feedback loop
877 ******************************************************************************/
878 @@ -3048,6 +3415,118 @@ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaimin
879 return new_gen;
880 }
881
882 +static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page,
883 + int old_gen, int new_gen)
884 +{
885 + int type = page_is_file_lru(page);
886 + int zone = page_zonenum(page);
887 + int delta = thp_nr_pages(page);
888 +
889 + VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
890 + VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
891 +
892 + walk->batched++;
893 +
894 + walk->nr_pages[old_gen][type][zone] -= delta;
895 + walk->nr_pages[new_gen][type][zone] += delta;
896 +}
897 +
898 +static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
899 +{
900 + int gen, type, zone;
901 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
902 +
903 + walk->batched = 0;
904 +
905 + for_each_gen_type_zone(gen, type, zone) {
906 + enum lru_list lru = type * LRU_INACTIVE_FILE;
907 + int delta = walk->nr_pages[gen][type][zone];
908 +
909 + if (!delta)
910 + continue;
911 +
912 + walk->nr_pages[gen][type][zone] = 0;
913 + WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
914 + lrugen->nr_pages[gen][type][zone] + delta);
915 +
916 + if (lru_gen_is_active(lruvec, gen))
917 + lru += LRU_ACTIVE;
918 + __update_lru_size(lruvec, lru, zone, delta);
919 + }
920 +}
921 +
922 +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
923 +{
924 + struct address_space *mapping;
925 + struct vm_area_struct *vma = args->vma;
926 + struct lru_gen_mm_walk *walk = args->private;
927 +
928 + if (!vma_is_accessible(vma))
929 + return true;
930 +
931 + if (is_vm_hugetlb_page(vma))
932 + return true;
933 +
934 + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
935 + return true;
936 +
937 + if (vma == get_gate_vma(vma->vm_mm))
938 + return true;
939 +
940 + if (vma_is_anonymous(vma))
941 + return !walk->can_swap;
942 +
943 + if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
944 + return true;
945 +
946 + mapping = vma->vm_file->f_mapping;
947 + if (mapping_unevictable(mapping))
948 + return true;
949 +
950 + if (shmem_mapping(mapping))
951 + return !walk->can_swap;
952 +
953 + /* to exclude special mappings like dax, etc. */
954 + return !mapping->a_ops->readpage;
955 +}
956 +
957 +/*
958 + * Some userspace memory allocators map many single-page VMAs. Instead of
959 + * returning back to the PGD table for each of such VMAs, finish an entire PMD
960 + * table to reduce zigzags and improve cache performance.
961 + */
962 +static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
963 + unsigned long *vm_start, unsigned long *vm_end)
964 +{
965 + unsigned long start = round_up(*vm_end, size);
966 + unsigned long end = (start | ~mask) + 1;
967 +
968 + VM_WARN_ON_ONCE(mask & size);
969 + VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
970 +
971 + while (args->vma) {
972 + if (start >= args->vma->vm_end) {
973 + args->vma = args->vma->vm_next;
974 + continue;
975 + }
976 +
977 + if (end && end <= args->vma->vm_start)
978 + return false;
979 +
980 + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) {
981 + args->vma = args->vma->vm_next;
982 + continue;
983 + }
984 +
985 + *vm_start = max(start, args->vma->vm_start);
986 + *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
987 +
988 + return true;
989 + }
990 +
991 + return false;
992 +}
993 +
994 static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
995 {
996 unsigned long pfn = pte_pfn(pte);
997 @@ -3066,8 +3545,28 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
998 return pfn;
999 }
1000
1001 +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
1002 +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
1003 +{
1004 + unsigned long pfn = pmd_pfn(pmd);
1005 +
1006 + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
1007 +
1008 + if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
1009 + return -1;
1010 +
1011 + if (WARN_ON_ONCE(pmd_devmap(pmd)))
1012 + return -1;
1013 +
1014 + if (WARN_ON_ONCE(!pfn_valid(pfn)))
1015 + return -1;
1016 +
1017 + return pfn;
1018 +}
1019 +#endif
1020 +
1021 static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
1022 - struct pglist_data *pgdat)
1023 + struct pglist_data *pgdat, bool can_swap)
1024 {
1025 struct page *page;
1026
1027 @@ -3082,9 +3581,375 @@ static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
1028 if (page_memcg_rcu(page) != memcg)
1029 return NULL;
1030
1031 + /* file VMAs can contain anon pages from COW */
1032 + if (!page_is_file_lru(page) && !can_swap)
1033 + return NULL;
1034 +
1035 return page;
1036 }
1037
1038 +static bool suitable_to_scan(int total, int young)
1039 +{
1040 + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
1041 +
1042 + /* suitable if the average number of young PTEs per cacheline is >=1 */
1043 + return young * n >= total;
1044 +}
1045 +
1046 +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
1047 + struct mm_walk *args)
1048 +{
1049 + int i;
1050 + pte_t *pte;
1051 + spinlock_t *ptl;
1052 + unsigned long addr;
1053 + int total = 0;
1054 + int young = 0;
1055 + struct lru_gen_mm_walk *walk = args->private;
1056 + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
1057 + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
1058 + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
1059 +
1060 + VM_WARN_ON_ONCE(pmd_leaf(*pmd));
1061 +
1062 + ptl = pte_lockptr(args->mm, pmd);
1063 + if (!spin_trylock(ptl))
1064 + return false;
1065 +
1066 + arch_enter_lazy_mmu_mode();
1067 +
1068 + pte = pte_offset_map(pmd, start & PMD_MASK);
1069 +restart:
1070 + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
1071 + unsigned long pfn;
1072 + struct page *page;
1073 +
1074 + total++;
1075 + walk->mm_stats[MM_LEAF_TOTAL]++;
1076 +
1077 + pfn = get_pte_pfn(pte[i], args->vma, addr);
1078 + if (pfn == -1)
1079 + continue;
1080 +
1081 + if (!pte_young(pte[i])) {
1082 + walk->mm_stats[MM_LEAF_OLD]++;
1083 + continue;
1084 + }
1085 +
1086 + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
1087 + if (!page)
1088 + continue;
1089 +
1090 + if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
1091 + VM_WARN_ON_ONCE(true);
1092 +
1093 + young++;
1094 + walk->mm_stats[MM_LEAF_YOUNG]++;
1095 +
1096 + if (pte_dirty(pte[i]) && !PageDirty(page) &&
1097 + !(PageAnon(page) && PageSwapBacked(page) &&
1098 + !PageSwapCache(page)))
1099 + set_page_dirty(page);
1100 +
1101 + old_gen = page_update_gen(page, new_gen);
1102 + if (old_gen >= 0 && old_gen != new_gen)
1103 + update_batch_size(walk, page, old_gen, new_gen);
1104 + }
1105 +
1106 + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
1107 + goto restart;
1108 +
1109 + pte_unmap(pte);
1110 +
1111 + arch_leave_lazy_mmu_mode();
1112 + spin_unlock(ptl);
1113 +
1114 + return suitable_to_scan(total, young);
1115 +}
1116 +
1117 +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
1118 +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
1119 + struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
1120 +{
1121 + int i;
1122 + pmd_t *pmd;
1123 + spinlock_t *ptl;
1124 + struct lru_gen_mm_walk *walk = args->private;
1125 + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
1126 + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
1127 + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
1128 +
1129 + VM_WARN_ON_ONCE(pud_leaf(*pud));
1130 +
1131 + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
1132 + if (*start == -1) {
1133 + *start = next;
1134 + return;
1135 + }
1136 +
1137 + i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
1138 + if (i && i <= MIN_LRU_BATCH) {
1139 + __set_bit(i - 1, bitmap);
1140 + return;
1141 + }
1142 +
1143 + pmd = pmd_offset(pud, *start);
1144 +
1145 + ptl = pmd_lockptr(args->mm, pmd);
1146 + if (!spin_trylock(ptl))
1147 + goto done;
1148 +
1149 + arch_enter_lazy_mmu_mode();
1150 +
1151 + do {
1152 + unsigned long pfn;
1153 + struct page *page;
1154 + unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
1155 +
1156 + pfn = get_pmd_pfn(pmd[i], vma, addr);
1157 + if (pfn == -1)
1158 + goto next;
1159 +
1160 + if (!pmd_trans_huge(pmd[i])) {
1161 + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
1162 + pmdp_test_and_clear_young(vma, addr, pmd + i);
1163 + goto next;
1164 + }
1165 +
1166 + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
1167 + if (!page)
1168 + goto next;
1169 +
1170 + if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
1171 + goto next;
1172 +
1173 + walk->mm_stats[MM_LEAF_YOUNG]++;
1174 +
1175 + if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
1176 + !(PageAnon(page) && PageSwapBacked(page) &&
1177 + !PageSwapCache(page)))
1178 + set_page_dirty(page);
1179 +
1180 + old_gen = page_update_gen(page, new_gen);
1181 + if (old_gen >= 0 && old_gen != new_gen)
1182 + update_batch_size(walk, page, old_gen, new_gen);
1183 +next:
1184 + i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
1185 + } while (i <= MIN_LRU_BATCH);
1186 +
1187 + arch_leave_lazy_mmu_mode();
1188 + spin_unlock(ptl);
1189 +done:
1190 + *start = -1;
1191 + bitmap_zero(bitmap, MIN_LRU_BATCH);
1192 +}
1193 +#else
1194 +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
1195 + struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
1196 +{
1197 +}
1198 +#endif
1199 +
1200 +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
1201 + struct mm_walk *args)
1202 +{
1203 + int i;
1204 + pmd_t *pmd;
1205 + unsigned long next;
1206 + unsigned long addr;
1207 + struct vm_area_struct *vma;
1208 + unsigned long pos = -1;
1209 + struct lru_gen_mm_walk *walk = args->private;
1210 + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
1211 +
1212 + VM_WARN_ON_ONCE(pud_leaf(*pud));
1213 +
1214 + /*
1215 + * Finish an entire PMD in two passes: the first only reaches to PTE
1216 + * tables to avoid taking the PMD lock; the second, if necessary, takes
1217 + * the PMD lock to clear the accessed bit in PMD entries.
1218 + */
1219 + pmd = pmd_offset(pud, start & PUD_MASK);
1220 +restart:
1221 + /* walk_pte_range() may call get_next_vma() */
1222 + vma = args->vma;
1223 + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
1224 + pmd_t val = pmd_read_atomic(pmd + i);
1225 +
1226 + /* for pmd_read_atomic() */
1227 + barrier();
1228 +
1229 + next = pmd_addr_end(addr, end);
1230 +
1231 + if (!pmd_present(val) || is_huge_zero_pmd(val)) {
1232 + walk->mm_stats[MM_LEAF_TOTAL]++;
1233 + continue;
1234 + }
1235 +
1236 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1237 + if (pmd_trans_huge(val)) {
1238 + unsigned long pfn = pmd_pfn(val);
1239 + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
1240 +
1241 + walk->mm_stats[MM_LEAF_TOTAL]++;
1242 +
1243 + if (!pmd_young(val)) {
1244 + walk->mm_stats[MM_LEAF_OLD]++;
1245 + continue;
1246 + }
1247 +
1248 + /* try to avoid unnecessary memory loads */
1249 + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
1250 + continue;
1251 +
1252 + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
1253 + continue;
1254 + }
1255 +#endif
1256 + walk->mm_stats[MM_NONLEAF_TOTAL]++;
1257 +
1258 +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
1259 + if (!pmd_young(val))
1260 + continue;
1261 +
1262 + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
1263 +#endif
1264 + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
1265 + continue;
1266 +
1267 + walk->mm_stats[MM_NONLEAF_FOUND]++;
1268 +
1269 + if (!walk_pte_range(&val, addr, next, args))
1270 + continue;
1271 +
1272 + walk->mm_stats[MM_NONLEAF_ADDED]++;
1273 +
1274 + /* carry over to the next generation */
1275 + update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
1276 + }
1277 +
1278 + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
1279 +
1280 + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
1281 + goto restart;
1282 +}
1283 +
1284 +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
1285 + struct mm_walk *args)
1286 +{
1287 + int i;
1288 + pud_t *pud;
1289 + unsigned long addr;
1290 + unsigned long next;
1291 + struct lru_gen_mm_walk *walk = args->private;
1292 +
1293 + VM_WARN_ON_ONCE(p4d_leaf(*p4d));
1294 +
1295 + pud = pud_offset(p4d, start & P4D_MASK);
1296 +restart:
1297 + for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
1298 + pud_t val = READ_ONCE(pud[i]);
1299 +
1300 + next = pud_addr_end(addr, end);
1301 +
1302 + if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
1303 + continue;
1304 +
1305 + walk_pmd_range(&val, addr, next, args);
1306 +
1307 + /* a racy check to curtail the waiting time */
1308 + if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
1309 + return 1;
1310 +
1311 + if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
1312 + end = (addr | ~PUD_MASK) + 1;
1313 + goto done;
1314 + }
1315 + }
1316 +
1317 + if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
1318 + goto restart;
1319 +
1320 + end = round_up(end, P4D_SIZE);
1321 +done:
1322 + if (!end || !args->vma)
1323 + return 1;
1324 +
1325 + walk->next_addr = max(end, args->vma->vm_start);
1326 +
1327 + return -EAGAIN;
1328 +}
1329 +
1330 +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
1331 +{
1332 + static const struct mm_walk_ops mm_walk_ops = {
1333 + .test_walk = should_skip_vma,
1334 + .p4d_entry = walk_pud_range,
1335 + };
1336 +
1337 + int err;
1338 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
1339 +
1340 + walk->next_addr = FIRST_USER_ADDRESS;
1341 +
1342 + do {
1343 + err = -EBUSY;
1344 +
1345 + /* page_update_gen() requires stable page_memcg() */
1346 + if (!mem_cgroup_trylock_pages(memcg))
1347 + break;
1348 +
1349 + /* the caller might be holding the lock for write */
1350 + if (mmap_read_trylock(mm)) {
1351 + err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
1352 +
1353 + mmap_read_unlock(mm);
1354 + }
1355 +
1356 + mem_cgroup_unlock_pages();
1357 +
1358 + if (walk->batched) {
1359 + spin_lock_irq(&lruvec->lru_lock);
1360 + reset_batch_size(lruvec, walk);
1361 + spin_unlock_irq(&lruvec->lru_lock);
1362 + }
1363 +
1364 + cond_resched();
1365 + } while (err == -EAGAIN);
1366 +}
1367 +
1368 +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
1369 +{
1370 + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
1371 +
1372 + if (pgdat && current_is_kswapd()) {
1373 + VM_WARN_ON_ONCE(walk);
1374 +
1375 + walk = &pgdat->mm_walk;
1376 + } else if (!pgdat && !walk) {
1377 + VM_WARN_ON_ONCE(current_is_kswapd());
1378 +
1379 + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
1380 + }
1381 +
1382 + current->reclaim_state->mm_walk = walk;
1383 +
1384 + return walk;
1385 +}
1386 +
1387 +static void clear_mm_walk(void)
1388 +{
1389 + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
1390 +
1391 + VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
1392 + VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
1393 +
1394 + current->reclaim_state->mm_walk = NULL;
1395 +
1396 + if (!current_is_kswapd())
1397 + kfree(walk);
1398 +}
1399 +
1400 static void inc_min_seq(struct lruvec *lruvec, int type)
1401 {
1402 struct lru_gen_struct *lrugen = &lruvec->lrugen;
1403 @@ -3136,7 +4001,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
1404 return success;
1405 }
1406
1407 -static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
1408 +static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
1409 {
1410 int prev, next;
1411 int type, zone;
1412 @@ -3146,9 +4011,6 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s
1413
1414 VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
1415
1416 - if (max_seq != lrugen->max_seq)
1417 - goto unlock;
1418 -
1419 for (type = ANON_AND_FILE - 1; type >= 0; type--) {
1420 if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
1421 continue;
1422 @@ -3186,10 +4048,76 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s
1423
1424 /* make sure preceding modifications appear */
1425 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
1426 -unlock:
1427 +
1428 spin_unlock_irq(&lruvec->lru_lock);
1429 }
1430
1431 +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
1432 + struct scan_control *sc, bool can_swap)
1433 +{
1434 + bool success;
1435 + struct lru_gen_mm_walk *walk;
1436 + struct mm_struct *mm = NULL;
1437 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
1438 +
1439 + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
1440 +
1441 + /* see the comment in iterate_mm_list() */
1442 + if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
1443 + success = false;
1444 + goto done;
1445 + }
1446 +
1447 + /*
1448 + * If the hardware doesn't automatically set the accessed bit, fall back
1449 + * to lru_gen_look_around(), which only clears the accessed bit in a
1450 + * handful of PTEs. Spreading the work out over a period of time usually
1451 + * is less efficient, but it avoids bursty page faults.
1452 + */
1453 + if (!arch_has_hw_pte_young()) {
1454 + success = iterate_mm_list_nowalk(lruvec, max_seq);
1455 + goto done;
1456 + }
1457 +
1458 + walk = set_mm_walk(NULL);
1459 + if (!walk) {
1460 + success = iterate_mm_list_nowalk(lruvec, max_seq);
1461 + goto done;
1462 + }
1463 +
1464 + walk->lruvec = lruvec;
1465 + walk->max_seq = max_seq;
1466 + walk->can_swap = can_swap;
1467 + walk->force_scan = false;
1468 +
1469 + do {
1470 + success = iterate_mm_list(lruvec, walk, &mm);
1471 + if (mm)
1472 + walk_mm(lruvec, mm, walk);
1473 +
1474 + cond_resched();
1475 + } while (mm);
1476 +done:
1477 + if (!success) {
1478 + if (sc->priority <= DEF_PRIORITY - 2)
1479 + wait_event_killable(lruvec->mm_state.wait,
1480 + max_seq < READ_ONCE(lrugen->max_seq));
1481 +
1482 + return max_seq < READ_ONCE(lrugen->max_seq);
1483 + }
1484 +
1485 + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
1486 +
1487 + inc_max_seq(lruvec, can_swap);
1488 + /* either this sees any waiters or they will see updated max_seq */
1489 + if (wq_has_sleeper(&lruvec->mm_state.wait))
1490 + wake_up_all(&lruvec->mm_state.wait);
1491 +
1492 + wakeup_flusher_threads(WB_REASON_VMSCAN);
1493 +
1494 + return true;
1495 +}
1496 +
1497 static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
1498 struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
1499 {
1500 @@ -3265,7 +4193,7 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1501
1502 need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
1503 if (need_aging)
1504 - inc_max_seq(lruvec, max_seq, swappiness);
1505 + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
1506 }
1507
1508 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
1509 @@ -3274,6 +4202,8 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
1510
1511 VM_WARN_ON_ONCE(!current_is_kswapd());
1512
1513 + set_mm_walk(pgdat);
1514 +
1515 memcg = mem_cgroup_iter(NULL, NULL, NULL);
1516 do {
1517 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
1518 @@ -3282,11 +4212,16 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
1519
1520 cond_resched();
1521 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
1522 +
1523 + clear_mm_walk();
1524 }
1525
1526 /*
1527 * This function exploits spatial locality when shrink_page_list() walks the
1528 - * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
1529 + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
1530 + * the scan was done cacheline efficiently, it adds the PMD entry pointing to
1531 + * the PTE table to the Bloom filter. This forms a feedback loop between the
1532 + * eviction and the aging.
1533 */
1534 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
1535 {
1536 @@ -3295,6 +4230,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
1537 unsigned long start;
1538 unsigned long end;
1539 unsigned long addr;
1540 + struct lru_gen_mm_walk *walk;
1541 + int young = 0;
1542 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
1543 struct page *page = pvmw->page;
1544 struct mem_cgroup *memcg = page_memcg(page);
1545 @@ -3309,6 +4246,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
1546 if (spin_is_contended(pvmw->ptl))
1547 return;
1548
1549 + /* avoid taking the LRU lock under the PTL when possible */
1550 + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
1551 +
1552 start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
1553 end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
1554
1555 @@ -3338,13 +4278,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
1556 if (!pte_young(pte[i]))
1557 continue;
1558
1559 - page = get_pfn_page(pfn, memcg, pgdat);
1560 + page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap);
1561 if (!page)
1562 continue;
1563
1564 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
1565 VM_WARN_ON_ONCE(true);
1566
1567 + young++;
1568 +
1569 if (pte_dirty(pte[i]) && !PageDirty(page) &&
1570 !(PageAnon(page) && PageSwapBacked(page) &&
1571 !PageSwapCache(page)))
1572 @@ -3360,7 +4302,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
1573 arch_leave_lazy_mmu_mode();
1574 rcu_read_unlock();
1575
1576 - if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
1577 + /* feedback from rmap walkers to page table walkers */
1578 + if (suitable_to_scan(i, young))
1579 + update_bloom_filter(lruvec, max_seq, pvmw->pmd);
1580 +
1581 + if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
1582 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
1583 page = pte_page(pte[i]);
1584 activate_page(page);
1585 @@ -3372,8 +4318,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
1586 if (!mem_cgroup_trylock_pages(memcg))
1587 return;
1588
1589 - spin_lock_irq(&lruvec->lru_lock);
1590 - new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
1591 + if (!walk) {
1592 + spin_lock_irq(&lruvec->lru_lock);
1593 + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
1594 + }
1595
1596 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
1597 page = compound_head(pte_page(pte[i]));
1598 @@ -3384,10 +4332,14 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
1599 if (old_gen < 0 || old_gen == new_gen)
1600 continue;
1601
1602 - lru_gen_update_size(lruvec, page, old_gen, new_gen);
1603 + if (walk)
1604 + update_batch_size(walk, page, old_gen, new_gen);
1605 + else
1606 + lru_gen_update_size(lruvec, page, old_gen, new_gen);
1607 }
1608
1609 - spin_unlock_irq(&lruvec->lru_lock);
1610 + if (!walk)
1611 + spin_unlock_irq(&lruvec->lru_lock);
1612
1613 mem_cgroup_unlock_pages();
1614 }
1615 @@ -3670,6 +4622,7 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
1616 struct page *page;
1617 enum vm_event_item item;
1618 struct reclaim_stat stat;
1619 + struct lru_gen_mm_walk *walk;
1620 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
1621 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1622
1623 @@ -3706,6 +4659,10 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
1624
1625 move_pages_to_lru(lruvec, &list);
1626
1627 + walk = current->reclaim_state->mm_walk;
1628 + if (walk && walk->batched)
1629 + reset_batch_size(lruvec, walk);
1630 +
1631 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
1632 if (!cgroup_reclaim(sc))
1633 __count_vm_events(item, reclaimed);
1634 @@ -3722,6 +4679,11 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
1635 return scanned;
1636 }
1637
1638 +/*
1639 + * For future optimizations:
1640 + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
1641 + * reclaim.
1642 + */
1643 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
1644 bool can_swap)
1645 {
1646 @@ -3747,7 +4709,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
1647 if (current_is_kswapd())
1648 return 0;
1649
1650 - inc_max_seq(lruvec, max_seq, can_swap);
1651 + if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
1652 + return nr_to_scan;
1653 done:
1654 return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
1655 }
1656 @@ -3761,6 +4724,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
1657
1658 blk_start_plug(&plug);
1659
1660 + set_mm_walk(lruvec_pgdat(lruvec));
1661 +
1662 while (true) {
1663 int delta;
1664 int swappiness;
1665 @@ -3788,6 +4753,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
1666 cond_resched();
1667 }
1668
1669 + clear_mm_walk();
1670 +
1671 blk_finish_plug(&plug);
1672 }
1673
1674 @@ -3804,15 +4771,21 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
1675
1676 for_each_gen_type_zone(gen, type, zone)
1677 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
1678 +
1679 + lruvec->mm_state.seq = MIN_NR_GENS;
1680 + init_waitqueue_head(&lruvec->mm_state.wait);
1681 }
1682
1683 #ifdef CONFIG_MEMCG
1684 void lru_gen_init_memcg(struct mem_cgroup *memcg)
1685 {
1686 + INIT_LIST_HEAD(&memcg->mm_list.fifo);
1687 + spin_lock_init(&memcg->mm_list.lock);
1688 }
1689
1690 void lru_gen_exit_memcg(struct mem_cgroup *memcg)
1691 {
1692 + int i;
1693 int nid;
1694
1695 for_each_node(nid) {
1696 @@ -3820,6 +4793,11 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
1697
1698 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
1699 sizeof(lruvec->lrugen.nr_pages)));
1700 +
1701 + for (i = 0; i < NR_BLOOM_FILTERS; i++) {
1702 + bitmap_free(lruvec->mm_state.filters[i]);
1703 + lruvec->mm_state.filters[i] = NULL;
1704 + }
1705 }
1706 }
1707 #endif
1708 --
1709 2.40.0
1710