kernel: Update MGLRU patchset
openwrt/staging/dedeckeh.git: target/linux/generic/backport-5.15/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch
From 640db3a029dca909af47157ca18f52b29d34a1b9 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:07 -0600
Subject: [PATCH 10/29] mm: multi-gen LRU: kill switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
  0x0001: the multi-gen LRU core
  0x0002: walking page tables, when arch_has_hw_pte_young() returns
          true
  0x0004: clearing the accessed bit in non-leaf PMD entries, when
          CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
  [yYnN]: apply to all the components above
E.g.,
  echo y >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0007
  echo 5 >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0005
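
To decode the values above: 0x0007 means all three components are enabled,
while 0x0005 keeps the core and the non-leaf PMD clearing but disables the
page table walks. As a further illustration (a sketch, assuming a kernel
where all three components are available), the [yYnN] shortcut toggles
everything at once:
  echo n >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0000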

NB: the page table walks happen on the scale of seconds under heavy memory
pressure, in which case the mmap_lock contention is a lesser concern,
compared with the LRU lock contention and the I/O congestion. So far the
only well-known case of mmap_lock contention happens on Android, due to
Scudo [1], which allocates several thousand VMAs for merely a few hundred
MBs. The SPF and the Maple Tree have also provided their own assessments
[2][3]. However, if walking page tables does worsen the mmap_lock
contention, the kill switch can be used to disable it (clearing the
0x0002 component). In this case the multi-gen LRU will suffer a minor
performance degradation, as shown previously.

Clearing the accessed bit in non-leaf PMD entries can also be disabled,
since this behavior was not tested on x86 varieties other than Intel and
AMD.
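
A minimal sketch of that case, assuming the kernel currently reports 0x0007
and arch_has_hw_pte_young() returns true: writing 3 clears only the 0x0004
component while leaving the core and the page table walks enabled.
  echo 3 >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0003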

[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/

Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cgroup.h          |  15 ++-
 include/linux/mm_inline.h       |  15 ++-
 include/linux/mmzone.h          |   9 ++
 kernel/cgroup/cgroup-internal.h |   1 -
 mm/Kconfig                      |   6 +
 mm/vmscan.c                     | 228 +++++++++++++++++++++++++++++++-
 6 files changed, 265 insertions(+), 9 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 45cdb12243e3..f9a5d6a81101 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
 css_put(&cgrp->self);
 }

+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
 /**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
 * as locks used during the cgroup_subsys::attach() methods.
 */
 #ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c) \
 rcu_dereference_check((task)->cgroups, \
@@ -708,6 +719,8 @@ struct cgroup;
 static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
 static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
 struct task_struct *t) { return 0; }
 static inline int cgroupstats_build(struct cgroupstats *stats,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 58aabb1ba020..e095c1c24311 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -91,10 +91,21 @@ static __always_inline enum lru_list page_lru(struct page *page)

 #ifdef CONFIG_LRU_GEN

+#ifdef CONFIG_LRU_GEN_ENABLED
 static inline bool lru_gen_enabled(void)
 {
- return true;
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#else
+static inline bool lru_gen_enabled(void)
+{
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
 }
+#endif

 static inline bool lru_gen_in_fault(void)
 {
@@ -207,7 +218,7 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo

 VM_WARN_ON_ONCE_PAGE(gen != -1, page);

- if (PageUnevictable(page))
+ if (PageUnevictable(page) || !lrugen->enabled)
 return false;
 /*
 * There are three common cases for this page:
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 659bab633bdf..edaf035503ed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -364,6 +364,13 @@ enum {
 LRU_GEN_FILE,
 };

+enum {
+ LRU_GEN_CORE,
+ LRU_GEN_MM_WALK,
+ LRU_GEN_NONLEAF_YOUNG,
+ NR_LRU_GEN_CAPS
+};
+
 #define MIN_LRU_BATCH BITS_PER_LONG
 #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)

@@ -405,6 +412,8 @@ struct lru_gen_struct {
 /* can be modified without holding the LRU lock */
 atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multi-gen LRU is enabled */
+ bool enabled;
 };

 enum {
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index d8fcc139ac05..28c32a01da7d 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
 #define DEFINE_CGROUP_MGCTX(name) \
 struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)

-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
diff --git a/mm/Kconfig b/mm/Kconfig
index 62433f3cd7ae..4a7d0af3c39b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -906,6 +906,12 @@ config LRU_GEN
 help
 A high performance LRU implementation to overcommit memory.

+config LRU_GEN_ENABLED
+ bool "Enable by default"
+ depends on LRU_GEN
+ help
+ This option enables the multi-gen LRU by default.
+
 config LRU_GEN_STATS
 bool "Full stats for debugging"
 depends on LRU_GEN
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b6f6fc2585e1..be37d996bc92 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,7 @@
 #include <linux/psi.h>
 #include <linux/pagewalk.h>
 #include <linux/shmem_fs.h>
+#include <linux/ctype.h>

 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -2841,6 +2842,14 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,

 #ifdef CONFIG_LRU_GEN

+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
+#endif
+
 /******************************************************************************
 * shorthand helpers
 ******************************************************************************/
@@ -3717,7 +3726,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
 goto next;

 if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
 pmdp_test_and_clear_young(vma, addr, pmd + i);
 goto next;
 }
@@ -3815,10 +3825,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
 walk->mm_stats[MM_NONLEAF_TOTAL]++;

 #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (!pmd_young(val))
- continue;
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (!pmd_young(val))
+ continue;

- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ }
 #endif
 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
 continue;
@@ -4080,7 +4092,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 * handful of PTEs. Spreading the work out over a period of time usually
 * is less efficient, but it avoids bursty page faults.
 */
- if (!arch_has_hw_pte_young()) {
+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
 success = iterate_mm_list_nowalk(lruvec, max_seq);
 goto done;
 }
@@ -4845,6 +4857,208 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 blk_finish_plug(&plug);
 }

+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ if (lrugen->enabled) {
+ enum lru_list lru;
+
+ for_each_evictable_lru(lru) {
+ if (!list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+ } else {
+ int gen, type, zone;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct page *page = lru_to_page(head);
+
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
+ VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page);
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
+ VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page);
+
+ del_page_from_lru_list(page, lruvec);
+ success = lru_gen_add_page(lruvec, page, false);
+ VM_WARN_ON_ONCE(!success);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct page *page = lru_to_page(head);
+
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
+ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
+
+ success = lru_gen_del_page(lruvec, page, false);
+ VM_WARN_ON_ONCE(!success);
+ add_page_to_lru_list(page, lruvec);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void lru_gen_change_state(bool enabled)
+{
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ cgroup_lock();
+ cpus_read_lock();
+ get_online_mems();
+ mutex_lock(&state_mutex);
+
+ if (enabled == lru_gen_enabled())
+ goto unlock;
+
+ if (enabled)
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ else
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ if (!lruvec)
+ continue;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
+
+ lruvec->lrugen.enabled = enabled;
+
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ put_online_mems();
+ cpus_read_unlock();
+ cgroup_unlock();
+}
+
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned int caps = 0;
+
+ if (get_cap(LRU_GEN_CORE))
+ caps |= BIT(LRU_GEN_CORE);
+
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ caps |= BIT(LRU_GEN_MM_WALK);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+}
+
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+ unsigned int caps;
+
+ if (tolower(*buf) == 'n')
+ caps = 0;
+ else if (tolower(*buf) == 'y')
+ caps = -1;
+ else if (kstrtouint(buf, 0, &caps))
+ return -EINVAL;
+
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+ bool enabled = caps & BIT(i);
+
+ if (i == LRU_GEN_CORE)
+ lru_gen_change_state(enabled);
+ else if (enabled)
+ static_branch_enable(&lru_gen_caps[i]);
+ else
+ static_branch_disable(&lru_gen_caps[i]);
+ }
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enabled, store_enabled
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
 /******************************************************************************
 * initialization
 ******************************************************************************/
@@ -4855,6 +5069,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 struct lru_gen_struct *lrugen = &lruvec->lrugen;

 lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled = lru_gen_enabled();

 for_each_gen_type_zone(gen, type, zone)
 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -4894,6 +5109,9 @@ static int __init init_lru_gen(void)
 BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);

+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
 return 0;
 };
 late_initcall(init_lru_gen);
--
2.40.0
