From fa6363828d314e837c5f79e97ea5e8c0d2f7f062 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:04 -0700
Subject: [PATCH 26/29] mm: multi-gen LRU: per-node lru_gen_page lists

For each node, memcgs are divided into two generations: the old and
the young. For each generation, memcgs are randomly sharded into
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
is virtually divided into three segments: the head, the tail and the
default.

An onlining memcg is added to the tail of a random bin in the old
generation. The eviction starts at the head of a random bin in the old
generation. The per-node memcg generation counter, whose remainder (mod
2) indexes the old generation, is incremented when all its bins become
empty.

There are four operations:
1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
   its current generation (old or young) and updates its "seg" to
   "head";
2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
   its current generation (old or young) and updates its "seg" to
   "tail";
3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
   the old generation, updates its "gen" to "old" and resets its "seg"
   to "default";
4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
   in the young generation, updates its "gen" to "young" and resets
   its "seg" to "default".

The events that trigger the above operations are:
1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
2. The first attempt to reclaim an memcg below low, which triggers
   MEMCG_LRU_TAIL;
3. The first attempt to reclaim an memcg below reclaimable size
   threshold, which triggers MEMCG_LRU_TAIL;
4. The second attempt to reclaim an memcg below reclaimable size
   threshold, which triggers MEMCG_LRU_YOUNG;
5. Attempting to reclaim an memcg below min, which triggers
   MEMCG_LRU_YOUNG;
6. Finishing the aging on the eviction path, which triggers
   MEMCG_LRU_YOUNG;
7. Offlining an memcg, which triggers MEMCG_LRU_OLD.

Note that memcg LRU only applies to global reclaim, and the
round-robin incrementing of their max_seq counters ensures the
eventual fairness to all eligible memcgs. For memcg reclaim, it still
relies on mem_cgroup_iter().

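For illustration, a minimal userspace sketch of the bookkeeping described
above (not part of the diff below): it reuses the MEMCG_NR_GENS and
MEMCG_NR_BINS constants and the seq arithmetic from this patch, but the
names memcg_node, memcg_lru_seq and rotate() are stand-ins invented for
the sketch, and the RCU hlist_nulls, per-bin lists and locking are omitted.

/* Illustration only: models where each MEMCG_LRU_* operation places a memcg. */
#include <stdio.h>
#include <stdlib.h>

#define MEMCG_NR_GENS 2
#define MEMCG_NR_BINS 8

#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)

enum { MEMCG_LRU_NOP, MEMCG_LRU_HEAD, MEMCG_LRU_TAIL, MEMCG_LRU_OLD, MEMCG_LRU_YOUNG };

/* stand-in for the per-memcg, per-node lru_gen_page fields used by the memcg LRU */
struct memcg_node {
        unsigned char gen;      /* which memcg generation this memcg is in */
        unsigned char seg;      /* "head", "tail" or the default segment (0) */
};

/* stand-in for pgdat->memcg_lru.seq; its remainder (mod 2) indexes the old generation */
static unsigned long memcg_lru_seq;

static void rotate(struct memcg_node *node, int op)
{
        int gen = node->gen;
        int seg = 0;                            /* OLD and YOUNG reset "seg" to default */
        int bin = rand() % MEMCG_NR_BINS;       /* bins are picked at random, as above */

        if (op == MEMCG_LRU_HEAD)
                seg = MEMCG_LRU_HEAD;           /* head of a bin in its current gen */
        else if (op == MEMCG_LRU_TAIL)
                seg = MEMCG_LRU_TAIL;           /* tail of a bin in its current gen */
        else if (op == MEMCG_LRU_OLD)
                gen = get_memcg_gen(memcg_lru_seq);     /* head of a bin, old gen */
        else if (op == MEMCG_LRU_YOUNG)
                gen = get_memcg_gen(memcg_lru_seq + 1); /* tail of a bin, young gen */

        node->gen = gen;
        node->seg = seg;
        printf("op %d: gen %d, bin %d, added at the %s\n", op, gen, bin,
               (op == MEMCG_LRU_TAIL || op == MEMCG_LRU_YOUNG) ? "tail" : "head");
}

int main(void)
{
        /* an onlining memcg starts at the tail of a bin in the old generation */
        struct memcg_node n = { .gen = get_memcg_gen(memcg_lru_seq), .seg = 0 };

        rotate(&n, MEMCG_LRU_TAIL);     /* e.g. first reclaim attempt below low */
        rotate(&n, MEMCG_LRU_YOUNG);    /* e.g. second attempt below the size threshold */
        return 0;
}

In the patch itself this corresponds to lru_gen_rotate_memcg(), which
additionally re-links lrugen->list between the fifo[gen][bin] hlist_nulls
under memcg_lru.lock and bumps memcg_lru.seq once the old generation's
bins are empty.
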
Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/memcontrol.h | 10 +
include/linux/mm_inline.h | 17 ++
include/linux/mmzone.h | 117 +++++++++++-
mm/memcontrol.c | 16 ++
mm/page_alloc.c | 1 +
mm/vmscan.c | 373 +++++++++++++++++++++++++++++++++----
6 files changed, 499 insertions(+), 35 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -823,6 +823,11 @@ static inline void obj_cgroup_put(struct
percpu_ref_put(&objcg->refcnt);
}

+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return !memcg || css_tryget(&memcg->css);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1288,6 +1293,11 @@ struct mem_cgroup *mem_cgroup_from_css(s
return NULL;
}

+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -112,6 +112,18 @@ static inline bool lru_gen_in_fault(void
return current->in_lru_fault;
}

+#ifdef CONFIG_MEMCG
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return READ_ONCE(lruvec->lrugen.seg);
+}
+#else
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+#endif
+
static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
@@ -287,6 +299,11 @@ static inline bool lru_gen_in_fault(void
return false;
}

+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
return false;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -7,6 +7,7 @@

#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
@@ -357,6 +358,15 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

+/* see the comment on MEMCG_NR_GENS */
+enum {
+ MEMCG_LRU_NOP,
+ MEMCG_LRU_HEAD,
+ MEMCG_LRU_TAIL,
+ MEMCG_LRU_OLD,
+ MEMCG_LRU_YOUNG,
+};
+
#ifdef CONFIG_LRU_GEN

enum {
@@ -416,6 +426,14 @@ struct lru_gen_page {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
+#ifdef CONFIG_MEMCG
+ /* the memcg generation this lru_gen_page belongs to */
+ u8 gen;
+ /* the list segment this lru_gen_page belongs to */
+ u8 seg;
+ /* per-node lru_gen_page list for global reclaim */
+ struct hlist_nulls_node list;
+#endif
};

enum {
@@ -469,12 +487,87 @@ void lru_gen_init_lruvec(struct lruvec *
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

#ifdef CONFIG_MEMCG
+
+/*
+ * For each node, memcgs are divided into two generations: the old and the
+ * young. For each generation, memcgs are randomly sharded into multiple bins
+ * to improve scalability. For each bin, the hlist_nulls is virtually divided
+ * into three segments: the head, the tail and the default.
+ *
+ * An onlining memcg is added to the tail of a random bin in the old generation.
+ * The eviction starts at the head of a random bin in the old generation. The
+ * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
+ * the old generation, is incremented when all its bins become empty.
+ *
+ * There are four operations:
+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ * current generation (old or young) and updates its "seg" to "head";
+ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ * current generation (old or young) and updates its "seg" to "tail";
+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ * generation, updates its "gen" to "old" and resets its "seg" to "default";
+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ * young generation, updates its "gen" to "young" and resets its "seg" to
+ * "default".
+ *
+ * The events that trigger the above operations are:
+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+ * 2. The first attempt to reclaim an memcg below low, which triggers
+ * MEMCG_LRU_TAIL;
+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+ *
+ * Note that memcg LRU only applies to global reclaim, and the round-robin
+ * incrementing of their max_seq counters ensures the eventual fairness to all
+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ */
+#define MEMCG_NR_GENS 2
+#define MEMCG_NR_BINS 8
+
+struct lru_gen_memcg {
+ /* the per-node memcg generation counter */
+ unsigned long seq;
+ /* each memcg has one lru_gen_page per node */
+ unsigned long nr_memcgs[MEMCG_NR_GENS];
+ /* per-node lru_gen_page list for global reclaim */
+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
+ /* protects the above */
+ spinlock_t lock;
+};
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat);
+
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
-#endif
+void lru_gen_online_memcg(struct mem_cgroup *memcg);
+void lru_gen_offline_memcg(struct mem_cgroup *memcg);
+void lru_gen_release_memcg(struct mem_cgroup *memcg);
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+
+#else /* !CONFIG_MEMCG */
+
+#define MEMCG_NR_GENS 1
+
+struct lru_gen_memcg {
+};
+
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
+#endif /* CONFIG_MEMCG */

#else /* !CONFIG_LRU_GEN */

+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
@@ -484,6 +577,7 @@ static inline void lru_gen_look_around(s
}

#ifdef CONFIG_MEMCG
+
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
@@ -491,7 +585,24 @@ static inline void lru_gen_init_memcg(st
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}
-#endif
+
+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+}
+
+#endif /* CONFIG_MEMCG */

#endif /* CONFIG_LRU_GEN */

@@ -1105,6 +1216,8 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
+ /* lru_gen_page list */
+ struct lru_gen_memcg memcg_lru;
#endif

ZONE_PADDING(_pad2_)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -549,6 +549,16 @@ static void mem_cgroup_update_tree(struc
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;

+ if (lru_gen_enabled()) {
+ struct lruvec *lruvec = &mem_cgroup_page_nodeinfo(memcg, page)->lruvec;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+
+ return;
+ }
+
mctz = soft_limit_tree_from_page(page);
if (!mctz)
return;
@@ -3434,6 +3444,9 @@ unsigned long mem_cgroup_soft_limit_recl
unsigned long excess;
unsigned long nr_scanned;

+ if (lru_gen_enabled())
+ return 0;
+
if (order > 0)
return 0;

@@ -5322,6 +5335,7 @@ static int mem_cgroup_css_online(struct
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
+ lru_gen_online_memcg(memcg);
return 0;
}

@@ -5348,6 +5362,7 @@ static void mem_cgroup_css_offline(struc
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ lru_gen_offline_memcg(memcg);

drain_all_stock(memcg);

@@ -5359,6 +5374,7 @@ static void mem_cgroup_css_released(stru
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

invalidate_reclaim_iterators(memcg);
+ lru_gen_release_memcg(memcg);
}

static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7661,6 +7661,7 @@ static void __init free_area_init_node(i
pgdat_set_deferred_range(pgdat);

free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
}

void __init free_area_init_memoryless_node(int nid)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,8 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/rculist_nulls.h>
+#include <linux/random.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -129,11 +131,6 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;

-#ifdef CONFIG_LRU_GEN
- /* help kswapd make better choices among multiple memcgs */
- unsigned long last_reclaimed;
-#endif
-
/* Allocation order */
s8 order;

@@ -2880,6 +2877,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_ca
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)

+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
+
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -4169,8 +4169,7 @@ done:
if (sc->priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
-
- return max_seq < READ_ONCE(lrugen->max_seq);
+ return false;
}

VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
@@ -4243,8 +4242,6 @@ static void lru_gen_age_node(struct pgli

VM_WARN_ON_ONCE(!current_is_kswapd());

- sc->last_reclaimed = sc->nr_reclaimed;
-
/* check the order to exclude compaction-induced reclaim */
if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
@@ -4833,8 +4830,7 @@ static bool should_run_aging(struct lruv
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4851,10 +4847,8 @@ static unsigned long get_nr_to_scan(stru
if (sc->priority == DEF_PRIORITY)
return nr_to_scan;

- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
-
/* skip this lruvec as it's low on cold pages */
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}

static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -4863,29 +4857,18 @@ static unsigned long get_nr_to_reclaim(s
if (!global_reclaim(sc))
return -1;

- /* discount the previous progress for kswapd */
- if (current_is_kswapd())
- return sc->nr_to_reclaim + sc->last_reclaimed;
-
return max(sc->nr_to_reclaim, compact_gap(sc->order));
}

-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- struct blk_plug plug;
+ long nr_to_scan;
unsigned long scanned = 0;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

- lru_add_drain();
-
- blk_start_plug(&plug);
-
- set_mm_walk(lruvec_pgdat(lruvec));
-
while (true) {
int delta;
int swappiness;
- unsigned long nr_to_scan;

if (sc->may_swap)
swappiness = get_swappiness(lruvec, sc);
@@ -4895,7 +4878,7 @@ static void lru_gen_shrink_lruvec(struct
swappiness = 0;

nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
- if (!nr_to_scan)
+ if (nr_to_scan <= 0)
break;

delta = evict_pages(lruvec, sc, swappiness);
@@ -4912,10 +4895,250 @@ static void lru_gen_shrink_lruvec(struct
cond_resched();
}

+ /* whether try_to_inc_max_seq() was successful */
+ return nr_to_scan < 0;
+}
+
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+{
+ bool success;
+ unsigned long scanned = sc->nr_scanned;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ int seg = lru_gen_memcg_seg(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (!lruvec_is_sizable(lruvec, sc))
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(memcg))
+ return MEMCG_LRU_YOUNG;
+
+ if (mem_cgroup_below_low(memcg)) {
+ /* see the comment on MEMCG_NR_GENS */
+ if (seg != MEMCG_LRU_TAIL)
+ return MEMCG_LRU_TAIL;
+
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ success = try_to_shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
+
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
+ current->reclaim_state->reclaimed_slab = 0;
+
+ return success ? MEMCG_LRU_YOUNG : 0;
+}
+
+#ifdef CONFIG_MEMCG
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int gen;
+ int bin;
+ int first_bin;
+ struct lruvec *lruvec;
+ struct lru_gen_page *lrugen;
+ const struct hlist_nulls_node *pos;
+ int op = 0;
+ struct mem_cgroup *memcg = NULL;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+ bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
+restart:
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+
+ rcu_read_lock();
+
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
+ memcg = lruvec_memcg(lruvec);
+
+ if (!mem_cgroup_tryget(memcg)) {
+ op = 0;
+ memcg = NULL;
+ continue;
+ }
+
+ rcu_read_unlock();
+
+ op = shrink_one(lruvec, sc);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ goto success;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+
+ /* restart if raced with lru_gen_rotate_memcg() */
+ if (gen != get_nulls_value(pos))
+ goto restart;
+
+ /* try the rest of the bins of the current generation */
+ bin = get_memcg_bin(bin + 1);
+ if (bin != first_bin)
+ goto restart;
+success:
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+
+ VM_WARN_ON_ONCE(global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(lruvec_pgdat(lruvec));
+
+ if (try_to_shrink_lruvec(lruvec, sc))
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+#endif
+
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_swappiness(lruvec, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ reclaimable /= MEMCG_NR_GENS;
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long reclaimed = sc->nr_reclaimed;
+
+ VM_WARN_ON_ONCE(!global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(pgdat);
+
+ set_initial_priority(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed = 0;
+
+ if (mem_cgroup_disabled())
+ shrink_one(&pgdat->__lruvec, sc);
+ else
+ shrink_many(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed += reclaimed;
+
clear_mm_walk();

blk_finish_plug(&plug);
+
+ /* kswapd should never fail */
+ pgdat->kswapd_failures = 0;
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+ int seg;
+ int old, new;
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ seg = 0;
+ new = old = lruvec->lrugen.gen;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (op == MEMCG_LRU_HEAD)
+ seg = MEMCG_LRU_HEAD;
+ else if (op == MEMCG_LRU_TAIL)
+ seg = MEMCG_LRU_TAIL;
+ else if (op == MEMCG_LRU_OLD)
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
+ else if (op == MEMCG_LRU_YOUNG)
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+ else
+ VM_WARN_ON_ONCE(true);
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ else
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+ pgdat->memcg_lru.nr_memcgs[old]--;
+ pgdat->memcg_lru.nr_memcgs[new]++;
+
+ lruvec->lrugen.gen = new;
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
}
+#endif

/******************************************************************************
* state change
@@ -5370,11 +5593,11 @@ static int run_cmd(char cmd, int memcg_i

if (!mem_cgroup_disabled()) {
rcu_read_lock();
+
memcg = mem_cgroup_from_id(memcg_id);
-#ifdef CONFIG_MEMCG
- if (memcg && !css_tryget(&memcg->css))
+ if (!mem_cgroup_tryget(memcg))
memcg = NULL;
-#endif
+
rcu_read_unlock();

if (!memcg)
@@ -5521,6 +5744,19 @@ void lru_gen_init_lruvec(struct lruvec *
}

#ifdef CONFIG_MEMCG
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->mm_list.fifo);
@@ -5544,7 +5780,69 @@ void lru_gen_exit_memcg(struct mem_cgrou
}
}
}
-#endif
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ pgdat->memcg_lru.nr_memcgs[gen]++;
+
+ lruvec->lrugen.gen = gen;
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+ }
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = lruvec->lrugen.gen;
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ pgdat->memcg_lru.nr_memcgs[gen]--;
+
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+#endif /* CONFIG_MEMCG */

static int __init init_lru_gen(void)
{
@@ -5571,6 +5869,10 @@ static void lru_gen_shrink_lruvec(struct
{
}

+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
#endif /* CONFIG_LRU_GEN */

static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5584,7 +5886,7 @@ static void shrink_lruvec(struct lruvec
bool proportional_reclaim;
struct blk_plug plug;

- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() && !global_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -5826,6 +6128,11 @@ static void shrink_node(pg_data_t *pgdat
struct lruvec *target_lruvec;
bool reclaimable = false;

+ if (lru_gen_enabled() && global_reclaim(sc)) {
+ lru_gen_shrink_node(pgdat, sc);
+ return;
+ }
+
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again: