kernel: bump 5.15 to 5.15.155
openwrt/staging/stintel.git: target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch
1 From b564b9471cd60ef1ee3961a224898ce4a9620d84 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Sun, 18 Sep 2022 02:00:03 -0600
4 Subject: [PATCH 06/29] mm: multi-gen LRU: minimal implementation
5 MIME-Version: 1.0
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
8
9 To avoid confusion, the terms "promotion" and "demotion" will be applied
10 to the multi-gen LRU, as a new convention; the terms "activation" and
11 "deactivation" will be applied to the active/inactive LRU, as usual.
12
13 The aging produces young generations. Given an lruvec, it increments
14 max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. The aging promotes
15 hot pages to the youngest generation when it finds them accessed through
16 page tables; the demotion of cold pages happens consequently when it
17 increments max_seq. Promotion in the aging path does not involve any LRU
18 list operations, only the updates of the gen counter and
19 lrugen->nr_pages[]; demotion, unless it is the result of the increment of
20 max_seq, requires LRU list operations, e.g., lru_deactivate_fn(). The
21 aging has the complexity O(nr_hot_pages), since it is only interested in
22 hot pages.
23
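Illustrative sketch (not part of the patch): the aging trigger described
above, as standalone C. MIN_NR_GENS is copied from the patch; needs_aging()
is a hypothetical helper name that mirrors the min_seq/max_seq check done in
should_run_aging() further down.

#include <stdbool.h>

#define MIN_NR_GENS 2U  /* from include/linux/mmzone.h in this patch */

/* a new generation is needed once the number of in-use generations,
 * max_seq - min_seq + 1, has dropped to MIN_NR_GENS */
static bool needs_aging(unsigned long max_seq, unsigned long min_seq)
{
        return max_seq - min_seq + 1 <= MIN_NR_GENS;
}
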
24 The eviction consumes old generations. Given an lruvec, it increments
25 min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes empty.
26 A feedback loop modeled after the PID controller monitors refaults over
27 anon and file types and decides which type to evict when both types are
28 available from the same generation.
29
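As a sketch of the comparison used by this feedback loop (it mirrors struct
ctrl_pos and positive_ctrl_err() added to mm/vmscan.c below; MIN_LRU_BATCH is
copied from the patch, assuming a 64-bit BITS_PER_LONG), refaulted/total
ratios are compared by cross-multiplication so that no division is needed:

#include <stdbool.h>

#define MIN_LRU_BATCH 64                /* BITS_PER_LONG, from this patch */

struct ctrl_pos {
        unsigned long refaulted;        /* refaults seen for this tier */
        unsigned long total;            /* evicted + protected for this tier */
        int gain;                       /* weight, e.g., swappiness for anon */
};

/* true if the process variable (pv) has few refaults or a lower gain-weighted
 * refaulted/total ratio than the setpoint (sp), i.e., pv needs no protection */
static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
{
        return pv->refaulted < MIN_LRU_BATCH ||
               pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
               (sp->refaulted + 1) * pv->total * pv->gain;
}
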
30 The protection of pages accessed multiple times through file descriptors
31 takes place in the eviction path. Each generation is divided into
32 multiple tiers. A page accessed N times through file descriptors is in
33 tier order_base_2(N). Tiers do not have dedicated lrugen->lists[], only
34 bits in page->flags. The aforementioned feedback loop also monitors
35 refaults over all tiers and decides when to protect pages in which tiers
36 (N>1), using the first tier (N=0,1) as a baseline. The first tier
37 contains single-use unmapped clean pages, which are most likely the best
38 choices. In contrast to promotion in the aging path, the protection of a
39 page in the eviction path is achieved by moving this page to the next
40 generation, i.e., min_seq+1, if the feedback loop decides so. This
41 approach has the following advantages:
42
43 1. It removes the cost of activation in the buffered access path by
44 inferring whether pages accessed multiple times through file
45 descriptors are statistically hot and thus worth protecting in the
46 eviction path.
47 2. It takes pages accessed through page tables into account and avoids
48 overprotecting pages accessed multiple times through file
49 descriptors. (Pages accessed through page tables are in the first
50 tier, since N=0.)
51 3. More tiers provide better protection for pages accessed more than
52 twice through file descriptors, when under heavy buffered I/O
53 workloads.
54
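Illustrative sketch only: how the N-accesses-to-tier mapping above works out
with MAX_NR_TIERS=4. tier_of() is a hypothetical standalone helper; the patch
itself stores N-1 in page->flags and computes order_base_2(refs + 1) in
lru_tier_from_refs().

/* tier = order_base_2(N): N=0,1 -> 0; N=2 -> 1; N=3,4 -> 2; N>=5 -> 3
 * (the spare-bit refs counter in page->flags saturates at the last tier) */
static int tier_of(unsigned int n)      /* N accesses through file descriptors */
{
        int tier = 0;

        while (n > 1) {                 /* ceil(log2(n)), i.e., order_base_2(n) */
                n = (n + 1) / 2;
                tier++;
        }
        return tier < 4 ? tier : 3;     /* clamp to MAX_NR_TIERS - 1 */
}
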
55 Server benchmark results:
56 Single workload:
57 fio (buffered I/O): +[30, 32]%
58 IOPS BW
59 5.19-rc1: 2673k 10.2GiB/s
60 patch1-6: 3491k 13.3GiB/s
61
62 Single workload:
63 memcached (anon): -[4, 6]%
64 Ops/sec KB/sec
65 5.19-rc1: 1161501.04 45177.25
66 patch1-6: 1106168.46 43025.04
67
68 Configurations:
69 CPU: two Xeon 6154
70 Mem: total 256G
71
72 Node 1 was only used as a ram disk to reduce the variance in the
73 results.
74
75 patch drivers/block/brd.c <<EOF
76 99,100c99,100
77 < gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
78 < page = alloc_page(gfp_flags);
79 ---
80 > gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE;
81 > page = alloc_pages_node(1, gfp_flags, 0);
82 EOF
83
84 cat >>/etc/systemd/system.conf <<EOF
85 CPUAffinity=numa
86 NUMAPolicy=bind
87 NUMAMask=0
88 EOF
89
90 cat >>/etc/memcached.conf <<EOF
91 -m 184320
92 -s /var/run/memcached/memcached.sock
93 -a 0766
94 -t 36
95 -B binary
96 EOF
97
98 cat fio.sh
99 modprobe brd rd_nr=1 rd_size=113246208
100 swapoff -a
101 mkfs.ext4 /dev/ram0
102 mount -t ext4 /dev/ram0 /mnt
103
104 mkdir /sys/fs/cgroup/user.slice/test
105 echo 38654705664 >/sys/fs/cgroup/user.slice/test/memory.max
106 echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs
107 fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \
108 --buffered=1 --ioengine=io_uring --iodepth=128 \
109 --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
110 --rw=randread --random_distribution=random --norandommap \
111 --time_based --ramp_time=10m --runtime=5m --group_reporting
112
113 cat memcached.sh
114 modprobe brd rd_nr=1 rd_size=113246208
115 swapoff -a
116 mkswap /dev/ram0
117 swapon /dev/ram0
118
119 memtier_benchmark -S /var/run/memcached/memcached.sock \
120 -P memcache_binary -n allkeys --key-minimum=1 \
121 --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \
122 --ratio 1:0 --pipeline 8 -d 2000
123
124 memtier_benchmark -S /var/run/memcached/memcached.sock \
125 -P memcache_binary -n allkeys --key-minimum=1 \
126 --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \
127 --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
128
129 Client benchmark results:
130 kswapd profiles:
131 5.19-rc1
132 40.33% page_vma_mapped_walk (overhead)
133 21.80% lzo1x_1_do_compress (real work)
134 7.53% do_raw_spin_lock
135 3.95% _raw_spin_unlock_irq
136 2.52% vma_interval_tree_iter_next
137 2.37% page_referenced_one
138 2.28% vma_interval_tree_subtree_search
139 1.97% anon_vma_interval_tree_iter_first
140 1.60% ptep_clear_flush
141 1.06% __zram_bvec_write
142
143 patch1-6
144 39.03% lzo1x_1_do_compress (real work)
145 18.47% page_vma_mapped_walk (overhead)
146 6.74% _raw_spin_unlock_irq
147 3.97% do_raw_spin_lock
148 2.49% ptep_clear_flush
149 2.48% anon_vma_interval_tree_iter_first
150 1.92% page_referenced_one
151 1.88% __zram_bvec_write
152 1.48% memmove
153 1.31% vma_interval_tree_iter_next
154
155 Configurations:
156 CPU: single Snapdragon 7c
157 Mem: total 4G
158
159 ChromeOS MemoryPressure [1]
160
161 [1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/
162
163 Link: https://lkml.kernel.org/r/20220918080010.2920238-7-yuzhao@google.com
164 Signed-off-by: Yu Zhao <yuzhao@google.com>
165 Acked-by: Brian Geffon <bgeffon@google.com>
166 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
167 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
168 Acked-by: Steven Barrett <steven@liquorix.net>
169 Acked-by: Suleiman Souhlal <suleiman@google.com>
170 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
171 Tested-by: Donald Carr <d@chaos-reins.com>
172 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
173 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
174 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
175 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
176 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
177 Cc: Andi Kleen <ak@linux.intel.com>
178 Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
179 Cc: Barry Song <baohua@kernel.org>
180 Cc: Catalin Marinas <catalin.marinas@arm.com>
181 Cc: Dave Hansen <dave.hansen@linux.intel.com>
182 Cc: Hillf Danton <hdanton@sina.com>
183 Cc: Jens Axboe <axboe@kernel.dk>
184 Cc: Johannes Weiner <hannes@cmpxchg.org>
185 Cc: Jonathan Corbet <corbet@lwn.net>
186 Cc: Linus Torvalds <torvalds@linux-foundation.org>
187 Cc: Matthew Wilcox <willy@infradead.org>
188 Cc: Mel Gorman <mgorman@suse.de>
189 Cc: Miaohe Lin <linmiaohe@huawei.com>
190 Cc: Michael Larabel <Michael@MichaelLarabel.com>
191 Cc: Michal Hocko <mhocko@kernel.org>
192 Cc: Mike Rapoport <rppt@kernel.org>
193 Cc: Mike Rapoport <rppt@linux.ibm.com>
194 Cc: Peter Zijlstra <peterz@infradead.org>
195 Cc: Qi Zheng <zhengqi.arch@bytedance.com>
196 Cc: Tejun Heo <tj@kernel.org>
197 Cc: Vlastimil Babka <vbabka@suse.cz>
198 Cc: Will Deacon <will@kernel.org>
199 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
200 ---
201 include/linux/mm_inline.h | 36 ++
202 include/linux/mmzone.h | 41 ++
203 include/linux/page-flags-layout.h | 5 +-
204 kernel/bounds.c | 2 +
205 mm/Kconfig | 11 +
206 mm/swap.c | 39 ++
207 mm/vmscan.c | 792 +++++++++++++++++++++++++++++-
208 mm/workingset.c | 110 ++++-
209 8 files changed, 1025 insertions(+), 11 deletions(-)
210
211 --- a/include/linux/mm_inline.h
212 +++ b/include/linux/mm_inline.h
213 @@ -106,6 +106,33 @@ static inline int lru_gen_from_seq(unsig
214 return seq % MAX_NR_GENS;
215 }
216
217 +static inline int lru_hist_from_seq(unsigned long seq)
218 +{
219 + return seq % NR_HIST_GENS;
220 +}
221 +
222 +static inline int lru_tier_from_refs(int refs)
223 +{
224 + VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
225 +
226 + /* see the comment in page_lru_refs() */
227 + return order_base_2(refs + 1);
228 +}
229 +
230 +static inline int page_lru_refs(struct page *page)
231 +{
232 + unsigned long flags = READ_ONCE(page->flags);
233 + bool workingset = flags & BIT(PG_workingset);
234 +
235 + /*
236 + * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
237 + * total number of accesses is N>1, since N=0,1 both map to the first
238 + * tier. lru_tier_from_refs() will account for this off-by-one. Also see
239 + * the comment on MAX_NR_TIERS.
240 + */
241 + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
242 +}
243 +
244 static inline int page_lru_gen(struct page *page)
245 {
246 unsigned long flags = READ_ONCE(page->flags);
247 @@ -158,6 +185,15 @@ static inline void lru_gen_update_size(s
248 __update_lru_size(lruvec, lru, zone, -delta);
249 return;
250 }
251 +
252 + /* promotion */
253 + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
254 + __update_lru_size(lruvec, lru, zone, -delta);
255 + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
256 + }
257 +
258 + /* demotion requires isolation, e.g., lru_deactivate_fn() */
259 + VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
260 }
261
262 static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
263 --- a/include/linux/mmzone.h
264 +++ b/include/linux/mmzone.h
265 @@ -327,6 +327,28 @@ enum lruvec_flags {
266 #define MIN_NR_GENS 2U
267 #define MAX_NR_GENS 4U
268
269 +/*
270 + * Each generation is divided into multiple tiers. A page accessed N times
271 + * through file descriptors is in tier order_base_2(N). A page in the first tier
272 + * (N=0,1) is marked by PG_referenced unless it was faulted in through page
273 + * tables or read ahead. A page in any other tier (N>1) is marked by
274 + * PG_referenced and PG_workingset. This implies a minimum of two tiers is
275 + * supported without using additional bits in page->flags.
276 + *
277 + * In contrast to moving across generations which requires the LRU lock, moving
278 + * across tiers only involves atomic operations on page->flags and therefore
279 + * has a negligible cost in the buffered access path. In the eviction path,
280 + * comparisons of refaulted/(evicted+protected) from the first tier and the
281 + * rest infer whether pages accessed multiple times through file descriptors
282 + * are statistically hot and thus worth protecting.
283 + *
284 + * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
285 + * number of categories of the active/inactive LRU when keeping track of
286 + * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
287 + * page->flags.
288 + */
289 +#define MAX_NR_TIERS 4U
290 +
291 #ifndef __GENERATING_BOUNDS_H
292
293 struct lruvec;
294 @@ -341,6 +363,16 @@ enum {
295 LRU_GEN_FILE,
296 };
297
298 +#define MIN_LRU_BATCH BITS_PER_LONG
299 +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
300 +
301 +/* whether to keep historical stats from evicted generations */
302 +#ifdef CONFIG_LRU_GEN_STATS
303 +#define NR_HIST_GENS MAX_NR_GENS
304 +#else
305 +#define NR_HIST_GENS 1U
306 +#endif
307 +
308 /*
309 * The youngest generation number is stored in max_seq for both anon and file
310 * types as they are aged on an equal footing. The oldest generation numbers are
311 @@ -363,6 +395,15 @@ struct lru_gen_struct {
312 struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
313 /* the multi-gen LRU sizes, eventually consistent */
314 long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
315 + /* the exponential moving average of refaulted */
316 + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
317 + /* the exponential moving average of evicted+protected */
318 + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
319 + /* the first tier doesn't need protection, hence the minus one */
320 + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
321 + /* can be modified without holding the LRU lock */
322 + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
323 + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
324 };
325
326 void lru_gen_init_lruvec(struct lruvec *lruvec);
327 --- a/include/linux/page-flags-layout.h
328 +++ b/include/linux/page-flags-layout.h
329 @@ -106,7 +106,10 @@
330 #error "Not enough bits in page flags"
331 #endif
332
333 -#define LRU_REFS_WIDTH 0
334 +/* see the comment on MAX_NR_TIERS */
335 +#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \
336 + ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
337 + NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
338
339 #endif
340 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
341 --- a/kernel/bounds.c
342 +++ b/kernel/bounds.c
343 @@ -24,8 +24,10 @@ int main(void)
344 DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
345 #ifdef CONFIG_LRU_GEN
346 DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
347 + DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
348 #else
349 DEFINE(LRU_GEN_WIDTH, 0);
350 + DEFINE(__LRU_REFS_WIDTH, 0);
351 #endif
352 /* End of constants */
353
354 --- a/mm/Kconfig
355 +++ b/mm/Kconfig
356 @@ -897,6 +897,7 @@ config IO_MAPPING
357 config SECRETMEM
358 def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
359
360 +# multi-gen LRU {
361 config LRU_GEN
362 bool "Multi-Gen LRU"
363 depends on MMU
364 @@ -905,6 +906,16 @@ config LRU_GEN
365 help
366 A high performance LRU implementation to overcommit memory.
367
368 +config LRU_GEN_STATS
369 + bool "Full stats for debugging"
370 + depends on LRU_GEN
371 + help
372 + Do not enable this option unless you plan to look at historical stats
373 + from evicted generations for debugging purpose.
374 +
375 + This option has a per-memcg and per-node memory overhead.
376 +# }
377 +
378 source "mm/damon/Kconfig"
379
380 endmenu
381 --- a/mm/swap.c
382 +++ b/mm/swap.c
383 @@ -389,6 +389,40 @@ static void __lru_cache_activate_page(st
384 local_unlock(&lru_pvecs.lock);
385 }
386
387 +#ifdef CONFIG_LRU_GEN
388 +static void page_inc_refs(struct page *page)
389 +{
390 + unsigned long new_flags, old_flags = READ_ONCE(page->flags);
391 +
392 + if (PageUnevictable(page))
393 + return;
394 +
395 + if (!PageReferenced(page)) {
396 + SetPageReferenced(page);
397 + return;
398 + }
399 +
400 + if (!PageWorkingset(page)) {
401 + SetPageWorkingset(page);
402 + return;
403 + }
404 +
405 + /* see the comment on MAX_NR_TIERS */
406 + do {
407 + new_flags = old_flags & LRU_REFS_MASK;
408 + if (new_flags == LRU_REFS_MASK)
409 + break;
410 +
411 + new_flags += BIT(LRU_REFS_PGOFF);
412 + new_flags |= old_flags & ~LRU_REFS_MASK;
413 + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
414 +}
415 +#else
416 +static void page_inc_refs(struct page *page)
417 +{
418 +}
419 +#endif /* CONFIG_LRU_GEN */
420 +
421 /*
422 * Mark a page as having seen activity.
423 *
424 @@ -403,6 +437,11 @@ void mark_page_accessed(struct page *pag
425 {
426 page = compound_head(page);
427
428 + if (lru_gen_enabled()) {
429 + page_inc_refs(page);
430 + return;
431 + }
432 +
433 if (!PageReferenced(page)) {
434 SetPageReferenced(page);
435 } else if (PageUnevictable(page)) {
436 --- a/mm/vmscan.c
437 +++ b/mm/vmscan.c
438 @@ -1142,9 +1142,11 @@ static int __remove_mapping(struct addre
439
440 if (PageSwapCache(page)) {
441 swp_entry_t swap = { .val = page_private(page) };
442 - mem_cgroup_swapout(page, swap);
443 +
444 + /* get a shadow entry before mem_cgroup_swapout() clears page_memcg() */
445 if (reclaimed && !mapping_exiting(mapping))
446 shadow = workingset_eviction(page, target_memcg);
447 + mem_cgroup_swapout(page, swap);
448 __delete_from_swap_cache(page, swap, shadow);
449 xa_unlock_irq(&mapping->i_pages);
450 put_swap_page(page, swap);
451 @@ -2502,6 +2504,9 @@ static void prepare_scan_count(pg_data_t
452 unsigned long file;
453 struct lruvec *target_lruvec;
454
455 + if (lru_gen_enabled())
456 + return;
457 +
458 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
459
460 /*
461 @@ -2827,6 +2832,17 @@ static bool can_age_anon_pages(struct pg
462 * shorthand helpers
463 ******************************************************************************/
464
465 +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
466 +
467 +#define DEFINE_MAX_SEQ(lruvec) \
468 + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
469 +
470 +#define DEFINE_MIN_SEQ(lruvec) \
471 + unsigned long min_seq[ANON_AND_FILE] = { \
472 + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
473 + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
474 + }
475 +
476 #define for_each_gen_type_zone(gen, type, zone) \
477 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
478 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
479 @@ -2852,6 +2868,745 @@ static struct lruvec __maybe_unused *get
480 return pgdat ? &pgdat->__lruvec : NULL;
481 }
482
483 +static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
484 +{
485 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
486 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
487 +
488 + if (!can_demote(pgdat->node_id, sc) &&
489 + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
490 + return 0;
491 +
492 + return mem_cgroup_swappiness(memcg);
493 +}
494 +
495 +static int get_nr_gens(struct lruvec *lruvec, int type)
496 +{
497 + return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
498 +}
499 +
500 +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
501 +{
502 + /* see the comment on lru_gen_struct */
503 + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
504 + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
505 + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
506 +}
507 +
508 +/******************************************************************************
509 + * refault feedback loop
510 + ******************************************************************************/
511 +
512 +/*
513 + * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
514 + *
515 + * The P term is refaulted/(evicted+protected) from a tier in the generation
516 + * currently being evicted; the I term is the exponential moving average of the
517 + * P term over the generations previously evicted, using the smoothing factor
518 + * 1/2; the D term isn't supported.
519 + *
520 + * The setpoint (SP) is always the first tier of one type; the process variable
521 + * (PV) is either any tier of the other type or any other tier of the same
522 + * type.
523 + *
524 + * The error is the difference between the SP and the PV; the correction is to
525 + * turn off protection when SP>PV or turn on protection when SP<PV.
526 + *
527 + * For future optimizations:
528 + * 1. The D term may discount the other two terms over time so that long-lived
529 + * generations can resist stale information.
530 + */
531 +struct ctrl_pos {
532 + unsigned long refaulted;
533 + unsigned long total;
534 + int gain;
535 +};
536 +
537 +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
538 + struct ctrl_pos *pos)
539 +{
540 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
541 + int hist = lru_hist_from_seq(lrugen->min_seq[type]);
542 +
543 + pos->refaulted = lrugen->avg_refaulted[type][tier] +
544 + atomic_long_read(&lrugen->refaulted[hist][type][tier]);
545 + pos->total = lrugen->avg_total[type][tier] +
546 + atomic_long_read(&lrugen->evicted[hist][type][tier]);
547 + if (tier)
548 + pos->total += lrugen->protected[hist][type][tier - 1];
549 + pos->gain = gain;
550 +}
551 +
552 +static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
553 +{
554 + int hist, tier;
555 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
556 + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
557 + unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
558 +
559 + lockdep_assert_held(&lruvec->lru_lock);
560 +
561 + if (!carryover && !clear)
562 + return;
563 +
564 + hist = lru_hist_from_seq(seq);
565 +
566 + for (tier = 0; tier < MAX_NR_TIERS; tier++) {
567 + if (carryover) {
568 + unsigned long sum;
569 +
570 + sum = lrugen->avg_refaulted[type][tier] +
571 + atomic_long_read(&lrugen->refaulted[hist][type][tier]);
572 + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
573 +
574 + sum = lrugen->avg_total[type][tier] +
575 + atomic_long_read(&lrugen->evicted[hist][type][tier]);
576 + if (tier)
577 + sum += lrugen->protected[hist][type][tier - 1];
578 + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
579 + }
580 +
581 + if (clear) {
582 + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
583 + atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
584 + if (tier)
585 + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
586 + }
587 + }
588 +}
589 +
590 +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
591 +{
592 + /*
593 + * Return true if the PV has a limited number of refaults or a lower
594 + * refaulted/total than the SP.
595 + */
596 + return pv->refaulted < MIN_LRU_BATCH ||
597 + pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
598 + (sp->refaulted + 1) * pv->total * pv->gain;
599 +}
600 +
601 +/******************************************************************************
602 + * the aging
603 + ******************************************************************************/
604 +
605 +/* protect pages accessed multiple times through file descriptors */
606 +static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
607 +{
608 + int type = page_is_file_lru(page);
609 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
610 + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
611 + unsigned long new_flags, old_flags = READ_ONCE(page->flags);
612 +
613 + VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
614 +
615 + do {
616 + new_gen = (old_gen + 1) % MAX_NR_GENS;
617 +
618 + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
619 + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
620 + /* for end_page_writeback() */
621 + if (reclaiming)
622 + new_flags |= BIT(PG_reclaim);
623 + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
624 +
625 + lru_gen_update_size(lruvec, page, old_gen, new_gen);
626 +
627 + return new_gen;
628 +}
629 +
630 +static void inc_min_seq(struct lruvec *lruvec, int type)
631 +{
632 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
633 +
634 + reset_ctrl_pos(lruvec, type, true);
635 + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
636 +}
637 +
638 +static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
639 +{
640 + int gen, type, zone;
641 + bool success = false;
642 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
643 + DEFINE_MIN_SEQ(lruvec);
644 +
645 + VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
646 +
647 + /* find the oldest populated generation */
648 + for (type = !can_swap; type < ANON_AND_FILE; type++) {
649 + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
650 + gen = lru_gen_from_seq(min_seq[type]);
651 +
652 + for (zone = 0; zone < MAX_NR_ZONES; zone++) {
653 + if (!list_empty(&lrugen->lists[gen][type][zone]))
654 + goto next;
655 + }
656 +
657 + min_seq[type]++;
658 + }
659 +next:
660 + ;
661 + }
662 +
663 + /* see the comment on lru_gen_struct */
664 + if (can_swap) {
665 + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
666 + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
667 + }
668 +
669 + for (type = !can_swap; type < ANON_AND_FILE; type++) {
670 + if (min_seq[type] == lrugen->min_seq[type])
671 + continue;
672 +
673 + reset_ctrl_pos(lruvec, type, true);
674 + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
675 + success = true;
676 + }
677 +
678 + return success;
679 +}
680 +
681 +static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
682 +{
683 + int prev, next;
684 + int type, zone;
685 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
686 +
687 + spin_lock_irq(&lruvec->lru_lock);
688 +
689 + VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
690 +
691 + if (max_seq != lrugen->max_seq)
692 + goto unlock;
693 +
694 + for (type = ANON_AND_FILE - 1; type >= 0; type--) {
695 + if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
696 + continue;
697 +
698 + VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
699 +
700 + inc_min_seq(lruvec, type);
701 + }
702 +
703 + /*
704 + * Update the active/inactive LRU sizes for compatibility. Both sides of
705 + * the current max_seq need to be covered, since max_seq+1 can overlap
706 + * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
707 + * overlap, cold/hot inversion happens.
708 + */
709 + prev = lru_gen_from_seq(lrugen->max_seq - 1);
710 + next = lru_gen_from_seq(lrugen->max_seq + 1);
711 +
712 + for (type = 0; type < ANON_AND_FILE; type++) {
713 + for (zone = 0; zone < MAX_NR_ZONES; zone++) {
714 + enum lru_list lru = type * LRU_INACTIVE_FILE;
715 + long delta = lrugen->nr_pages[prev][type][zone] -
716 + lrugen->nr_pages[next][type][zone];
717 +
718 + if (!delta)
719 + continue;
720 +
721 + __update_lru_size(lruvec, lru, zone, delta);
722 + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
723 + }
724 + }
725 +
726 + for (type = 0; type < ANON_AND_FILE; type++)
727 + reset_ctrl_pos(lruvec, type, false);
728 +
729 + /* make sure preceding modifications appear */
730 + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
731 +unlock:
732 + spin_unlock_irq(&lruvec->lru_lock);
733 +}
734 +
735 +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
736 + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
737 +{
738 + int gen, type, zone;
739 + unsigned long old = 0;
740 + unsigned long young = 0;
741 + unsigned long total = 0;
742 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
743 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
744 +
745 + for (type = !can_swap; type < ANON_AND_FILE; type++) {
746 + unsigned long seq;
747 +
748 + for (seq = min_seq[type]; seq <= max_seq; seq++) {
749 + unsigned long size = 0;
750 +
751 + gen = lru_gen_from_seq(seq);
752 +
753 + for (zone = 0; zone < MAX_NR_ZONES; zone++)
754 + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
755 +
756 + total += size;
757 + if (seq == max_seq)
758 + young += size;
759 + else if (seq + MIN_NR_GENS == max_seq)
760 + old += size;
761 + }
762 + }
763 +
764 + /* try to scrape all its memory if this memcg was deleted */
765 + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
766 +
767 + /*
768 + * The aging tries to be lazy to reduce the overhead, while the eviction
769 + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
770 + * ideal number of generations is MIN_NR_GENS+1.
771 + */
772 + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
773 + return true;
774 + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
775 + return false;
776 +
777 + /*
778 + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
779 + * of the total number of pages for each generation. A reasonable range
780 + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
781 + * aging cares about the upper bound of hot pages, while the eviction
782 + * cares about the lower bound of cold pages.
783 + */
784 + if (young * MIN_NR_GENS > total)
785 + return true;
786 + if (old * (MIN_NR_GENS + 2) < total)
787 + return true;
788 +
789 + return false;
790 +}
791 +
792 +static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
793 +{
794 + bool need_aging;
795 + unsigned long nr_to_scan;
796 + int swappiness = get_swappiness(lruvec, sc);
797 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
798 + DEFINE_MAX_SEQ(lruvec);
799 + DEFINE_MIN_SEQ(lruvec);
800 +
801 + VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
802 +
803 + mem_cgroup_calculate_protection(NULL, memcg);
804 +
805 + if (mem_cgroup_below_min(memcg))
806 + return;
807 +
808 + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
809 + if (need_aging)
810 + inc_max_seq(lruvec, max_seq, swappiness);
811 +}
812 +
813 +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
814 +{
815 + struct mem_cgroup *memcg;
816 +
817 + VM_WARN_ON_ONCE(!current_is_kswapd());
818 +
819 + memcg = mem_cgroup_iter(NULL, NULL, NULL);
820 + do {
821 + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
822 +
823 + age_lruvec(lruvec, sc);
824 +
825 + cond_resched();
826 + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
827 +}
828 +
829 +/******************************************************************************
830 + * the eviction
831 + ******************************************************************************/
832 +
833 +static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx)
834 +{
835 + bool success;
836 + int gen = page_lru_gen(page);
837 + int type = page_is_file_lru(page);
838 + int zone = page_zonenum(page);
839 + int delta = thp_nr_pages(page);
840 + int refs = page_lru_refs(page);
841 + int tier = lru_tier_from_refs(refs);
842 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
843 +
844 + VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);
845 +
846 + /* unevictable */
847 + if (!page_evictable(page)) {
848 + success = lru_gen_del_page(lruvec, page, true);
849 + VM_WARN_ON_ONCE_PAGE(!success, page);
850 + SetPageUnevictable(page);
851 + add_page_to_lru_list(page, lruvec);
852 + __count_vm_events(UNEVICTABLE_PGCULLED, delta);
853 + return true;
854 + }
855 +
856 + /* dirty lazyfree */
857 + if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) {
858 + success = lru_gen_del_page(lruvec, page, true);
859 + VM_WARN_ON_ONCE_PAGE(!success, page);
860 + SetPageSwapBacked(page);
861 + add_page_to_lru_list_tail(page, lruvec);
862 + return true;
863 + }
864 +
865 + /* protected */
866 + if (tier > tier_idx) {
867 + int hist = lru_hist_from_seq(lrugen->min_seq[type]);
868 +
869 + gen = page_inc_gen(lruvec, page, false);
870 + list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
871 +
872 + WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
873 + lrugen->protected[hist][type][tier - 1] + delta);
874 + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
875 + return true;
876 + }
877 +
878 + /* waiting for writeback */
879 + if (PageLocked(page) || PageWriteback(page) ||
880 + (type == LRU_GEN_FILE && PageDirty(page))) {
881 + gen = page_inc_gen(lruvec, page, true);
882 + list_move(&page->lru, &lrugen->lists[gen][type][zone]);
883 + return true;
884 + }
885 +
886 + return false;
887 +}
888 +
889 +static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc)
890 +{
891 + bool success;
892 +
893 + /* unmapping inhibited */
894 + if (!sc->may_unmap && page_mapped(page))
895 + return false;
896 +
897 + /* swapping inhibited */
898 + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
899 + (PageDirty(page) ||
900 + (PageAnon(page) && !PageSwapCache(page))))
901 + return false;
902 +
903 + /* raced with release_pages() */
904 + if (!get_page_unless_zero(page))
905 + return false;
906 +
907 + /* raced with another isolation */
908 + if (!TestClearPageLRU(page)) {
909 + put_page(page);
910 + return false;
911 + }
912 +
913 + /* see the comment on MAX_NR_TIERS */
914 + if (!PageReferenced(page))
915 + set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
916 +
917 + /* for shrink_page_list() */
918 + ClearPageReclaim(page);
919 + ClearPageReferenced(page);
920 +
921 + success = lru_gen_del_page(lruvec, page, true);
922 + VM_WARN_ON_ONCE_PAGE(!success, page);
923 +
924 + return true;
925 +}
926 +
927 +static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
928 + int type, int tier, struct list_head *list)
929 +{
930 + int gen, zone;
931 + enum vm_event_item item;
932 + int sorted = 0;
933 + int scanned = 0;
934 + int isolated = 0;
935 + int remaining = MAX_LRU_BATCH;
936 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
937 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
938 +
939 + VM_WARN_ON_ONCE(!list_empty(list));
940 +
941 + if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
942 + return 0;
943 +
944 + gen = lru_gen_from_seq(lrugen->min_seq[type]);
945 +
946 + for (zone = sc->reclaim_idx; zone >= 0; zone--) {
947 + LIST_HEAD(moved);
948 + int skipped = 0;
949 + struct list_head *head = &lrugen->lists[gen][type][zone];
950 +
951 + while (!list_empty(head)) {
952 + struct page *page = lru_to_page(head);
953 + int delta = thp_nr_pages(page);
954 +
955 + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
956 + VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
957 + VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
958 + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
959 +
960 + scanned += delta;
961 +
962 + if (sort_page(lruvec, page, tier))
963 + sorted += delta;
964 + else if (isolate_page(lruvec, page, sc)) {
965 + list_add(&page->lru, list);
966 + isolated += delta;
967 + } else {
968 + list_move(&page->lru, &moved);
969 + skipped += delta;
970 + }
971 +
972 + if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
973 + break;
974 + }
975 +
976 + if (skipped) {
977 + list_splice(&moved, head);
978 + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
979 + }
980 +
981 + if (!remaining || isolated >= MIN_LRU_BATCH)
982 + break;
983 + }
984 +
985 + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
986 + if (!cgroup_reclaim(sc)) {
987 + __count_vm_events(item, isolated);
988 + __count_vm_events(PGREFILL, sorted);
989 + }
990 + __count_memcg_events(memcg, item, isolated);
991 + __count_memcg_events(memcg, PGREFILL, sorted);
992 + __count_vm_events(PGSCAN_ANON + type, isolated);
993 +
994 + /*
995 + * There might not be eligible pages due to reclaim_idx, may_unmap and
996 + * may_writepage. Check the remaining to prevent livelock if it's not
997 + * making progress.
998 + */
999 + return isolated || !remaining ? scanned : 0;
1000 +}
1001 +
1002 +static int get_tier_idx(struct lruvec *lruvec, int type)
1003 +{
1004 + int tier;
1005 + struct ctrl_pos sp, pv;
1006 +
1007 + /*
1008 + * To leave a margin for fluctuations, use a larger gain factor (1:2).
1009 + * This value is chosen because any other tier would have at least twice
1010 + * as many refaults as the first tier.
1011 + */
1012 + read_ctrl_pos(lruvec, type, 0, 1, &sp);
1013 + for (tier = 1; tier < MAX_NR_TIERS; tier++) {
1014 + read_ctrl_pos(lruvec, type, tier, 2, &pv);
1015 + if (!positive_ctrl_err(&sp, &pv))
1016 + break;
1017 + }
1018 +
1019 + return tier - 1;
1020 +}
1021 +
1022 +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
1023 +{
1024 + int type, tier;
1025 + struct ctrl_pos sp, pv;
1026 + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
1027 +
1028 + /*
1029 + * Compare the first tier of anon with that of file to determine which
1030 + * type to scan. Also need to compare other tiers of the selected type
1031 + * with the first tier of the other type to determine the last tier (of
1032 + * the selected type) to evict.
1033 + */
1034 + read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
1035 + read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
1036 + type = positive_ctrl_err(&sp, &pv);
1037 +
1038 + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
1039 + for (tier = 1; tier < MAX_NR_TIERS; tier++) {
1040 + read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
1041 + if (!positive_ctrl_err(&sp, &pv))
1042 + break;
1043 + }
1044 +
1045 + *tier_idx = tier - 1;
1046 +
1047 + return type;
1048 +}
1049 +
1050 +static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
1051 + int *type_scanned, struct list_head *list)
1052 +{
1053 + int i;
1054 + int type;
1055 + int scanned;
1056 + int tier = -1;
1057 + DEFINE_MIN_SEQ(lruvec);
1058 +
1059 + /*
1060 + * Try to make the obvious choice first. When anon and file are both
1061 + * available from the same generation, interpret swappiness 1 as file
1062 + * first and 200 as anon first.
1063 + */
1064 + if (!swappiness)
1065 + type = LRU_GEN_FILE;
1066 + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
1067 + type = LRU_GEN_ANON;
1068 + else if (swappiness == 1)
1069 + type = LRU_GEN_FILE;
1070 + else if (swappiness == 200)
1071 + type = LRU_GEN_ANON;
1072 + else
1073 + type = get_type_to_scan(lruvec, swappiness, &tier);
1074 +
1075 + for (i = !swappiness; i < ANON_AND_FILE; i++) {
1076 + if (tier < 0)
1077 + tier = get_tier_idx(lruvec, type);
1078 +
1079 + scanned = scan_pages(lruvec, sc, type, tier, list);
1080 + if (scanned)
1081 + break;
1082 +
1083 + type = !type;
1084 + tier = -1;
1085 + }
1086 +
1087 + *type_scanned = type;
1088 +
1089 + return scanned;
1090 +}
1091 +
1092 +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
1093 +{
1094 + int type;
1095 + int scanned;
1096 + int reclaimed;
1097 + LIST_HEAD(list);
1098 + struct page *page;
1099 + enum vm_event_item item;
1100 + struct reclaim_stat stat;
1101 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
1102 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1103 +
1104 + spin_lock_irq(&lruvec->lru_lock);
1105 +
1106 + scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
1107 +
1108 + scanned += try_to_inc_min_seq(lruvec, swappiness);
1109 +
1110 + if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
1111 + scanned = 0;
1112 +
1113 + spin_unlock_irq(&lruvec->lru_lock);
1114 +
1115 + if (list_empty(&list))
1116 + return scanned;
1117 +
1118 + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
1119 +
1120 + list_for_each_entry(page, &list, lru) {
1121 + /* restore LRU_REFS_FLAGS cleared by isolate_page() */
1122 + if (PageWorkingset(page))
1123 + SetPageReferenced(page);
1124 +
1125 + /* don't add rejected pages to the oldest generation */
1126 + if (PageReclaim(page) &&
1127 + (PageDirty(page) || PageWriteback(page)))
1128 + ClearPageActive(page);
1129 + else
1130 + SetPageActive(page);
1131 + }
1132 +
1133 + spin_lock_irq(&lruvec->lru_lock);
1134 +
1135 + move_pages_to_lru(lruvec, &list);
1136 +
1137 + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
1138 + if (!cgroup_reclaim(sc))
1139 + __count_vm_events(item, reclaimed);
1140 + __count_memcg_events(memcg, item, reclaimed);
1141 + __count_vm_events(PGSTEAL_ANON + type, reclaimed);
1142 +
1143 + spin_unlock_irq(&lruvec->lru_lock);
1144 +
1145 + mem_cgroup_uncharge_list(&list);
1146 + free_unref_page_list(&list);
1147 +
1148 + sc->nr_reclaimed += reclaimed;
1149 +
1150 + return scanned;
1151 +}
1152 +
1153 +static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
1154 + bool can_swap)
1155 +{
1156 + bool need_aging;
1157 + unsigned long nr_to_scan;
1158 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
1159 + DEFINE_MAX_SEQ(lruvec);
1160 + DEFINE_MIN_SEQ(lruvec);
1161 +
1162 + if (mem_cgroup_below_min(memcg) ||
1163 + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
1164 + return 0;
1165 +
1166 + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
1167 + if (!need_aging)
1168 + return nr_to_scan;
1169 +
1170 + /* skip the aging path at the default priority */
1171 + if (sc->priority == DEF_PRIORITY)
1172 + goto done;
1173 +
1174 + /* leave the work to lru_gen_age_node() */
1175 + if (current_is_kswapd())
1176 + return 0;
1177 +
1178 + inc_max_seq(lruvec, max_seq, can_swap);
1179 +done:
1180 + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
1181 +}
1182 +
1183 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1184 +{
1185 + struct blk_plug plug;
1186 + unsigned long scanned = 0;
1187 +
1188 + lru_add_drain();
1189 +
1190 + blk_start_plug(&plug);
1191 +
1192 + while (true) {
1193 + int delta;
1194 + int swappiness;
1195 + unsigned long nr_to_scan;
1196 +
1197 + if (sc->may_swap)
1198 + swappiness = get_swappiness(lruvec, sc);
1199 + else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
1200 + swappiness = 1;
1201 + else
1202 + swappiness = 0;
1203 +
1204 + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
1205 + if (!nr_to_scan)
1206 + break;
1207 +
1208 + delta = evict_pages(lruvec, sc, swappiness);
1209 + if (!delta)
1210 + break;
1211 +
1212 + scanned += delta;
1213 + if (scanned >= nr_to_scan)
1214 + break;
1215 +
1216 + cond_resched();
1217 + }
1218 +
1219 + blk_finish_plug(&plug);
1220 +}
1221 +
1222 /******************************************************************************
1223 * initialization
1224 ******************************************************************************/
1225 @@ -2894,6 +3649,16 @@ static int __init init_lru_gen(void)
1226 };
1227 late_initcall(init_lru_gen);
1228
1229 +#else /* !CONFIG_LRU_GEN */
1230 +
1231 +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
1232 +{
1233 +}
1234 +
1235 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1236 +{
1237 +}
1238 +
1239 #endif /* CONFIG_LRU_GEN */
1240
1241 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1242 @@ -2907,6 +3672,11 @@ static void shrink_lruvec(struct lruvec
1243 bool proportional_reclaim;
1244 struct blk_plug plug;
1245
1246 + if (lru_gen_enabled()) {
1247 + lru_gen_shrink_lruvec(lruvec, sc);
1248 + return;
1249 + }
1250 +
1251 get_scan_count(lruvec, sc, nr);
1252
1253 /* Record the original scan target for proportional adjustments later */
1254 @@ -3375,6 +4145,9 @@ static void snapshot_refaults(struct mem
1255 struct lruvec *target_lruvec;
1256 unsigned long refaults;
1257
1258 + if (lru_gen_enabled())
1259 + return;
1260 +
1261 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
1262 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
1263 target_lruvec->refaults[0] = refaults;
1264 @@ -3739,12 +4512,16 @@ unsigned long try_to_free_mem_cgroup_pag
1265 }
1266 #endif
1267
1268 -static void age_active_anon(struct pglist_data *pgdat,
1269 - struct scan_control *sc)
1270 +static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
1271 {
1272 struct mem_cgroup *memcg;
1273 struct lruvec *lruvec;
1274
1275 + if (lru_gen_enabled()) {
1276 + lru_gen_age_node(pgdat, sc);
1277 + return;
1278 + }
1279 +
1280 if (!can_age_anon_pages(pgdat, sc))
1281 return;
1282
1283 @@ -4061,12 +4838,11 @@ restart:
1284 sc.may_swap = !nr_boost_reclaim;
1285
1286 /*
1287 - * Do some background aging of the anon list, to give
1288 - * pages a chance to be referenced before reclaiming. All
1289 - * pages are rotated regardless of classzone as this is
1290 - * about consistent aging.
1291 + * Do some background aging, to give pages a chance to be
1292 + * referenced before reclaiming. All pages are rotated
1293 + * regardless of classzone as this is about consistent aging.
1294 */
1295 - age_active_anon(pgdat, &sc);
1296 + kswapd_age_node(pgdat, &sc);
1297
1298 /*
1299 * If we're getting trouble reclaiming, start doing writepage
1300 --- a/mm/workingset.c
1301 +++ b/mm/workingset.c
1302 @@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
1303 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
1304 bool workingset)
1305 {
1306 - eviction >>= bucket_order;
1307 eviction &= EVICTION_MASK;
1308 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
1309 eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
1310 @@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow,
1311
1312 *memcgidp = memcgid;
1313 *pgdat = NODE_DATA(nid);
1314 - *evictionp = entry << bucket_order;
1315 + *evictionp = entry;
1316 *workingsetp = workingset;
1317 }
1318
1319 +#ifdef CONFIG_LRU_GEN
1320 +
1321 +static void *lru_gen_eviction(struct page *page)
1322 +{
1323 + int hist;
1324 + unsigned long token;
1325 + unsigned long min_seq;
1326 + struct lruvec *lruvec;
1327 + struct lru_gen_struct *lrugen;
1328 + int type = page_is_file_lru(page);
1329 + int delta = thp_nr_pages(page);
1330 + int refs = page_lru_refs(page);
1331 + int tier = lru_tier_from_refs(refs);
1332 + struct mem_cgroup *memcg = page_memcg(page);
1333 + struct pglist_data *pgdat = page_pgdat(page);
1334 +
1335 + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
1336 +
1337 + lruvec = mem_cgroup_lruvec(memcg, pgdat);
1338 + lrugen = &lruvec->lrugen;
1339 + min_seq = READ_ONCE(lrugen->min_seq[type]);
1340 + token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
1341 +
1342 + hist = lru_hist_from_seq(min_seq);
1343 + atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
1344 +
1345 + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
1346 +}
1347 +
1348 +static void lru_gen_refault(struct page *page, void *shadow)
1349 +{
1350 + int hist, tier, refs;
1351 + int memcg_id;
1352 + bool workingset;
1353 + unsigned long token;
1354 + unsigned long min_seq;
1355 + struct lruvec *lruvec;
1356 + struct lru_gen_struct *lrugen;
1357 + struct mem_cgroup *memcg;
1358 + struct pglist_data *pgdat;
1359 + int type = page_is_file_lru(page);
1360 + int delta = thp_nr_pages(page);
1361 +
1362 + unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
1363 +
1364 + if (pgdat != page_pgdat(page))
1365 + return;
1366 +
1367 + rcu_read_lock();
1368 +
1369 + memcg = page_memcg_rcu(page);
1370 + if (memcg_id != mem_cgroup_id(memcg))
1371 + goto unlock;
1372 +
1373 + lruvec = mem_cgroup_lruvec(memcg, pgdat);
1374 + lrugen = &lruvec->lrugen;
1375 +
1376 + min_seq = READ_ONCE(lrugen->min_seq[type]);
1377 + if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
1378 + goto unlock;
1379 +
1380 + hist = lru_hist_from_seq(min_seq);
1381 + /* see the comment in page_lru_refs() */
1382 + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
1383 + tier = lru_tier_from_refs(refs);
1384 +
1385 + atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
1386 + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
1387 +
1388 + /*
1389 + * Count the following two cases as stalls:
1390 + * 1. For pages accessed through page tables, hotter pages pushed out
1391 + * hot pages which refaulted immediately.
1392 + * 2. For pages accessed multiple times through file descriptors,
1393 + * numbers of accesses might have been out of the range.
1394 + */
1395 + if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
1396 + SetPageWorkingset(page);
1397 + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
1398 + }
1399 +unlock:
1400 + rcu_read_unlock();
1401 +}
1402 +
1403 +#else /* !CONFIG_LRU_GEN */
1404 +
1405 +static void *lru_gen_eviction(struct page *page)
1406 +{
1407 + return NULL;
1408 +}
1409 +
1410 +static void lru_gen_refault(struct page *page, void *shadow)
1411 +{
1412 +}
1413 +
1414 +#endif /* CONFIG_LRU_GEN */
1415 +
1416 /**
1417 * workingset_age_nonresident - age non-resident entries as LRU ages
1418 * @lruvec: the lruvec that was aged
1419 @@ -264,10 +360,14 @@ void *workingset_eviction(struct page *p
1420 VM_BUG_ON_PAGE(page_count(page), page);
1421 VM_BUG_ON_PAGE(!PageLocked(page), page);
1422
1423 + if (lru_gen_enabled())
1424 + return lru_gen_eviction(page);
1425 +
1426 lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
1427 /* XXX: target_memcg can be NULL, go through lruvec */
1428 memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
1429 eviction = atomic_long_read(&lruvec->nonresident_age);
1430 + eviction >>= bucket_order;
1431 workingset_age_nonresident(lruvec, thp_nr_pages(page));
1432 return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
1433 }
1434 @@ -296,7 +396,13 @@ void workingset_refault(struct page *pag
1435 bool workingset;
1436 int memcgid;
1437
1438 + if (lru_gen_enabled()) {
1439 + lru_gen_refault(page, shadow);
1440 + return;
1441 + }
1442 +
1443 unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
1444 + eviction <<= bucket_order;
1445
1446 rcu_read_lock();
1447 /*