target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch
1 From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Tue, 29 Jun 2021 20:46:47 -0600
4 Subject: [PATCH 07/10] mm: multigenerational lru: eviction
5
6 The eviction consumes old generations. Given an lruvec, the eviction
7 scans pages on lrugen->lists indexed by anon and file min_seq[]
8 (modulo MAX_NR_GENS). It first tries to select a type based on the
9 values of min_seq[]. If they are equal, it selects the type with the
10 lower refaulted %. The eviction sorts a page according to its
11 updated generation number if the aging has found this page accessed.
12 It also moves a page to the next generation if this page is from an
13 upper tier that has a higher refaulted % than the base tier. The
14 eviction increments min_seq[] of the selected type when it finds that the
15 lrugen->lists indexed by min_seq[] of that type are empty.
16
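A minimal userspace sketch of the selection step above, using hypothetical
type indices and per-type counters (the in-tree logic lives in isolate_pages()
and get_type_to_scan() further down in this patch, which also apply
swappiness-derived gain factors):

  enum { TYPE_ANON, TYPE_FILE };

  /* Pick anon or file: the older min_seq[] first, then the lower refaulted %. */
  static int select_type(const unsigned long min_seq[2],
                         const unsigned long refaulted[2],
                         const unsigned long evicted[2])
  {
          if (min_seq[TYPE_ANON] != min_seq[TYPE_FILE])
                  return min_seq[TYPE_ANON] < min_seq[TYPE_FILE] ?
                         TYPE_ANON : TYPE_FILE;

          /* cross-multiply to compare refaulted %; +1 guards empty counters */
          return refaulted[TYPE_ANON] * (evicted[TYPE_FILE] + 1) <=
                 refaulted[TYPE_FILE] * (evicted[TYPE_ANON] + 1) ?
                 TYPE_ANON : TYPE_FILE;
  }
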
17 Each generation is divided into multiple tiers. Tiers represent
18 different ranges of numbers of accesses from file descriptors only.
19 Pages accessed N times via file descriptors belong to tier
20 order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers,
21 and they require additional MAX_NR_TIERS-2 bits in page->flags. In
22 contrast to moving between generations which requires list operations,
23 moving between tiers only involves operations on page->flags and
24 therefore has a negligible cost. A feedback loop modeled after the PID
25 controller monitors refaulted % across all tiers and decides when to
26 protect pages from which tiers.
27
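As an illustration of the tier placement above, a hypothetical helper that
mirrors order_base_2(N) for a page accessed N times via file descriptors; the
in-tree counterpart is lru_tier_from_refs(), added by this patch, which works
on the capped reference count stored in page->flags:

  /* tier = order_base_2(N): the smallest t such that 2^t >= N */
  static int tier_from_accesses(unsigned int n)
  {
          int tier = 0;

          while ((1U << tier) < n)
                  tier++;

          return tier;
  }

  /* N = 1 -> tier 0 (base), N = 2 -> tier 1, N = 3..4 -> tier 2, N = 5..8 -> tier 3 */
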
28 Unmapped pages are initially added to the oldest generation and then
29 conditionally protected by tiers. Each tier keeps track of how many
30 pages from it have refaulted. Tier 0 is the base tier and pages from
31 it are evicted unconditionally because there are no better candidates.
32 Pages from an upper tier are either evicted or moved to the next
33 generation, depending on whether this upper tier has a higher
34 refaulted % than the base tier. This model has the following
35 advantages:
36 1) It removes the cost in the buffered access path and reduces the
37 overall cost of protection because pages are conditionally protected
38 in the reclaim path.
39 2) It takes mapped pages into account and avoids overprotecting
40 pages accessed multiple times via file descriptors.
41 3) Additional tiers improve the protection of pages accessed more
42 than twice.
43
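A simplified sketch of the protection test described above, with hypothetical
parameter names: an upper tier is protected only while its refaulted % exceeds
that of the base tier, compared by cross-multiplication to avoid division. The
in-tree version, positive_ctrl_err() below, additionally applies a gain factor
and folds in an exponential moving average over previously evicted generations:

  #include <stdbool.h>

  /* true iff tier_refaulted/tier_total > base_refaulted/base_total */
  static bool protect_upper_tier(unsigned long base_refaulted,
                                 unsigned long base_total,
                                 unsigned long tier_refaulted,
                                 unsigned long tier_total)
  {
          if (!base_total)
                  base_total = 1;
          if (!tier_total)
                  tier_total = 1;

          return tier_refaulted * base_total > base_refaulted * tier_total;
  }
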
44 Signed-off-by: Yu Zhao <yuzhao@google.com>
45 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
46 Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac
47 ---
48 include/linux/mm_inline.h | 10 +
49 include/linux/mmzone.h | 33 +++
50 mm/swap.c | 42 +++
51 mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++-
52 mm/workingset.c | 120 ++++++++-
53 5 files changed, 757 insertions(+), 3 deletions(-)
54
55 --- a/include/linux/mm_inline.h
56 +++ b/include/linux/mm_inline.h
57 @@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi
58 return seq % NR_HIST_GENS;
59 }
60
61 +/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */
62 +static inline int lru_tier_from_refs(int refs)
63 +{
64 + VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));
65 +
66 + return order_base_2(refs + 1);
67 +}
68 +
69 /* The youngest and the second youngest generations are counted as active. */
70 static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
71 {
72 @@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru
73 gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
74
75 new_flags &= ~LRU_GEN_MASK;
76 + if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS)
77 + new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
78 /* for shrink_page_list() */
79 if (reclaiming)
80 new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
81 --- a/include/linux/mmzone.h
82 +++ b/include/linux/mmzone.h
83 @@ -319,6 +319,30 @@ struct page_vma_mapped_walk;
84 #define MIN_NR_GENS 2
85 #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
86
87 +/*
88 + * Each generation is divided into multiple tiers. Tiers represent different
89 + * ranges of numbers of accesses from file descriptors, i.e.,
90 + * mark_page_accessed(). In contrast to moving between generations which
91 + * requires the lru lock, moving between tiers only involves an atomic
92 + * operation on page->flags and therefore has a negligible cost.
93 + *
94 + * The purposes of tiers are to:
95 + * 1) estimate whether pages accessed multiple times via file descriptors are
96 + * more active than pages accessed only via page tables by separating the two
97 + * access types into upper tiers and the base tier, and comparing refaulted %
98 + * across all tiers.
99 + * 2) improve buffered io performance by deferring the protection of pages
100 + * accessed multiple times until the eviction. That is, the protection happens
101 + * in the reclaim path, not the access path.
102 + *
103 + * Pages accessed N times via file descriptors belong to tier order_base_2(N).
104 + * The base tier may be marked by PageReferenced(). All upper tiers are marked
105 + * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are
106 + * used to support more than one upper tier.
107 + */
108 +#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN)
109 +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
110 +
111 /* Whether to keep stats for historical generations. */
112 #ifdef CONFIG_LRU_GEN_STATS
113 #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
114 @@ -337,6 +361,15 @@ struct lrugen {
115 struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
116 /* the sizes of the multigenerational lru lists in pages */
117 unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
118 + /* the exponential moving average of refaulted */
119 + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
120 + /* the exponential moving average of protected+evicted */
121 + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
122 + /* the base tier isn't protected, hence the minus one */
123 + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
124 + /* incremented without holding the lru lock */
125 + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
126 + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
127 /* whether the multigenerational lru is enabled */
128 bool enabled[ANON_AND_FILE];
129 };
130 --- a/mm/swap.c
131 +++ b/mm/swap.c
132 @@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st
133 local_unlock(&lru_pvecs.lock);
134 }
135
136 +#ifdef CONFIG_LRU_GEN
137 +static void page_inc_refs(struct page *page)
138 +{
139 + unsigned long refs;
140 + unsigned long old_flags, new_flags;
141 +
142 + if (PageUnevictable(page))
143 + return;
144 +
145 + /* see the comment on MAX_NR_TIERS */
146 + do {
147 + new_flags = old_flags = READ_ONCE(page->flags);
148 +
149 + if (!(new_flags & BIT(PG_referenced))) {
150 + new_flags |= BIT(PG_referenced);
151 + continue;
152 + }
153 +
154 + if (!(new_flags & BIT(PG_workingset))) {
155 + new_flags |= BIT(PG_workingset);
156 + continue;
157 + }
158 +
159 + refs = new_flags & LRU_REFS_MASK;
160 + refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK);
161 +
162 + new_flags &= ~LRU_REFS_MASK;
163 + new_flags |= refs;
164 + } while (new_flags != old_flags &&
165 + cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
166 +}
167 +#else
168 +static void page_inc_refs(struct page *page)
169 +{
170 +}
171 +#endif /* CONFIG_LRU_GEN */
172 +
173 /*
174 * Mark a page as having seen activity.
175 *
176 @@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag
177 {
178 page = compound_head(page);
179
180 + if (lru_gen_enabled()) {
181 + page_inc_refs(page);
182 + return;
183 + }
184 +
185 if (!PageReferenced(page)) {
186 SetPageReferenced(page);
187 } else if (PageUnevictable(page)) {
188 --- a/mm/vmscan.c
189 +++ b/mm/vmscan.c
190 @@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre
191
192 if (PageSwapCache(page)) {
193 swp_entry_t swap = { .val = page_private(page) };
194 - mem_cgroup_swapout(page, swap);
195 +
196 + /* get a shadow entry before page_memcg() is cleared */
197 if (reclaimed && !mapping_exiting(mapping))
198 shadow = workingset_eviction(page, target_memcg);
199 + mem_cgroup_swapout(page, swap);
200 __delete_from_swap_cache(page, swap, shadow);
201 xa_unlock_irq(&mapping->i_pages);
202 put_swap_page(page, swap);
203 @@ -1410,6 +1412,11 @@ retry:
204 if (!sc->may_unmap && page_mapped(page))
205 goto keep_locked;
206
207 + /* lru_gen_look_around() has updated this page? */
208 + if (lru_gen_enabled() && !ignore_references &&
209 + page_mapped(page) && PageReferenced(page))
210 + goto keep_locked;
211 +
212 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
213 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
214
215 @@ -2505,6 +2512,9 @@ static void prepare_scan_count(pg_data_t
216 unsigned long file;
217 struct lruvec *target_lruvec;
218
219 + if (lru_gen_enabled())
220 + return;
221 +
222 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
223
224 /*
225 @@ -2845,6 +2855,17 @@ static int page_lru_gen(struct page *pag
226 return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
227 }
228
229 +static int page_lru_tier(struct page *page)
230 +{
231 + int refs;
232 + unsigned long flags = READ_ONCE(page->flags);
233 +
234 + refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ?
235 + ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0;
236 +
237 + return lru_tier_from_refs(refs);
238 +}
239 +
240 static int get_swappiness(struct mem_cgroup *memcg)
241 {
242 return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ?
243 @@ -3181,6 +3202,91 @@ done:
244 }
245
246 /******************************************************************************
247 + * refault feedback loop
248 + ******************************************************************************/
249 +
250 +/*
251 + * A feedback loop modeled after the PID controller. Currently supports the
252 + * proportional (P) and the integral (I) terms; the derivative (D) term can be
253 + * added if necessary. The setpoint (SP) is the desired position; the process
254 + * variable (PV) is the measured position. The error is the difference between
255 + * the SP and the PV. A positive error results in a positive control output
256 + * correction, which, in our case, is to allow eviction.
257 + *
258 + * The P term is refaulted % of the current generation being evicted. The I
259 + * term is the exponential moving average of refaulted % of previously evicted
260 + * generations, using the smoothing factor 1/2.
261 + *
262 + * Our goal is to maintain proportional refaulted % across all tiers.
263 + */
264 +struct ctrl_pos {
265 + unsigned long refaulted;
266 + unsigned long total;
267 + int gain;
268 +};
269 +
270 +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
271 + struct ctrl_pos *pos)
272 +{
273 + struct lrugen *lrugen = &lruvec->evictable;
274 + int hist = lru_hist_from_seq(lrugen->min_seq[type]);
275 +
276 + pos->refaulted = lrugen->avg_refaulted[type][tier] +
277 + atomic_long_read(&lrugen->refaulted[hist][type][tier]);
278 + pos->total = lrugen->avg_total[type][tier] +
279 + atomic_long_read(&lrugen->evicted[hist][type][tier]);
280 + if (tier)
281 + pos->total += lrugen->protected[hist][type][tier - 1];
282 + pos->gain = gain;
283 +}
284 +
285 +static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type)
286 +{
287 + int tier;
288 + int hist = lru_hist_from_seq(gen);
289 + struct lrugen *lrugen = &lruvec->evictable;
290 + bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
291 + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
292 +
293 + if (!carryover && !clear)
294 + return;
295 +
296 + for (tier = 0; tier < MAX_NR_TIERS; tier++) {
297 + if (carryover) {
298 + unsigned long sum;
299 +
300 + sum = lrugen->avg_refaulted[type][tier] +
301 + atomic_long_read(&lrugen->refaulted[hist][type][tier]);
302 + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
303 +
304 + sum = lrugen->avg_total[type][tier] +
305 + atomic_long_read(&lrugen->evicted[hist][type][tier]);
306 + if (tier)
307 + sum += lrugen->protected[hist][type][tier - 1];
308 + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
309 + }
310 +
311 + if (clear) {
312 + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
313 + atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
314 + if (tier)
315 + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
316 + }
317 + }
318 +}
319 +
320 +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
321 +{
322 + /*
323 + * Allow eviction if the PV has a limited number of refaulted pages or a
324 + * lower refaulted % than the SP.
325 + */
326 + return pv->refaulted < MIN_BATCH_SIZE ||
327 + pv->refaulted * max(sp->total, 1UL) * sp->gain <=
328 + sp->refaulted * max(pv->total, 1UL) * pv->gain;
329 +}
330 +
331 +/******************************************************************************
332 * the aging
333 ******************************************************************************/
334
335 @@ -3200,6 +3306,7 @@ static int page_update_gen(struct page *
336
337 new_flags &= ~LRU_GEN_MASK;
338 new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
339 + new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
340 } while (new_flags != old_flags &&
341 cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
342
343 @@ -3231,6 +3338,7 @@ static void page_inc_gen(struct page *pa
344
345 new_flags &= ~LRU_GEN_MASK;
346 new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
347 + new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
348 /* for end_page_writeback() */
349 if (reclaiming)
350 new_flags |= BIT(PG_reclaim);
351 @@ -3722,6 +3830,7 @@ static bool inc_min_seq(struct lruvec *l
352 }
353 }
354
355 + reset_ctrl_pos(lruvec, gen, type);
356 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
357
358 return true;
359 @@ -3759,6 +3868,8 @@ next:
360 if (min_seq[type] == lrugen->min_seq[type])
361 continue;
362
363 + gen = lru_gen_from_seq(lrugen->min_seq[type]);
364 + reset_ctrl_pos(lruvec, gen, type);
365 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
366 success = true;
367 }
368 @@ -3820,6 +3931,9 @@ static void inc_max_seq(struct lruvec *l
369 }
370 }
371
372 + for (type = 0; type < ANON_AND_FILE; type++)
373 + reset_ctrl_pos(lruvec, gen, type);
374 +
375 WRITE_ONCE(lrugen->timestamps[gen], jiffies);
376 /* make sure all preceding modifications appear first */
377 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
378 @@ -4101,6 +4215,433 @@ void lru_gen_look_around(struct page_vma
379 }
380
381 /******************************************************************************
382 + * the eviction
383 + ******************************************************************************/
384 +
385 +static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx)
386 +{
387 + bool success;
388 + int gen = page_lru_gen(page);
389 + int type = page_is_file_lru(page);
390 + int zone = page_zonenum(page);
391 + int tier = page_lru_tier(page);
392 + int delta = thp_nr_pages(page);
393 + struct lrugen *lrugen = &lruvec->evictable;
394 +
395 + VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page);
396 +
397 + /* an mlocked page? */
398 + if (!page_evictable(page)) {
399 + success = lru_gen_del_page(page, lruvec, true);
400 + VM_BUG_ON_PAGE(!success, page);
401 + SetPageUnevictable(page);
402 + add_page_to_lru_list(page, lruvec);
403 + __count_vm_events(UNEVICTABLE_PGCULLED, delta);
404 + return true;
405 + }
406 +
407 + /* a lazy-free page that has been written into? */
408 + if (type && PageDirty(page) && PageAnon(page)) {
409 + success = lru_gen_del_page(page, lruvec, true);
410 + VM_BUG_ON_PAGE(!success, page);
411 + SetPageSwapBacked(page);
412 + add_page_to_lru_list_tail(page, lruvec);
413 + return true;
414 + }
415 +
416 + /* page_update_gen() has updated this page? */
417 + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
418 + list_move(&page->lru, &lrugen->lists[gen][type][zone]);
419 + return true;
420 + }
421 +
422 + /* protect this page if its tier has a higher refaulted % */
423 + if (tier > tier_idx) {
424 + int hist = lru_hist_from_seq(gen);
425 +
426 + page_inc_gen(page, lruvec, false);
427 + WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
428 + lrugen->protected[hist][type][tier - 1] + delta);
429 + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
430 + return true;
431 + }
432 +
433 + /* mark this page for reclaim if it's pending writeback */
434 + if (PageWriteback(page) || (type && PageDirty(page))) {
435 + page_inc_gen(page, lruvec, true);
436 + return true;
437 + }
438 +
439 + return false;
440 +}
441 +
442 +static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc)
443 +{
444 + bool success;
445 +
446 + if (!sc->may_unmap && page_mapped(page))
447 + return false;
448 +
449 + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
450 + (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
451 + return false;
452 +
453 + if (!get_page_unless_zero(page))
454 + return false;
455 +
456 + if (!TestClearPageLRU(page)) {
457 + put_page(page);
458 + return false;
459 + }
460 +
461 + success = lru_gen_del_page(page, lruvec, true);
462 + VM_BUG_ON_PAGE(!success, page);
463 +
464 + return true;
465 +}
466 +
467 +static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
468 + int type, int tier, struct list_head *list)
469 +{
470 + int gen, zone;
471 + enum vm_event_item item;
472 + int sorted = 0;
473 + int scanned = 0;
474 + int isolated = 0;
475 + int remaining = MAX_BATCH_SIZE;
476 + struct lrugen *lrugen = &lruvec->evictable;
477 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
478 +
479 + VM_BUG_ON(!list_empty(list));
480 +
481 + if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
482 + return 0;
483 +
484 + gen = lru_gen_from_seq(lrugen->min_seq[type]);
485 +
486 + for (zone = sc->reclaim_idx; zone >= 0; zone--) {
487 + LIST_HEAD(moved);
488 + int skipped = 0;
489 + struct list_head *head = &lrugen->lists[gen][type][zone];
490 +
491 + while (!list_empty(head)) {
492 + struct page *page = lru_to_page(head);
493 + int delta = thp_nr_pages(page);
494 +
495 + VM_BUG_ON_PAGE(PageTail(page), page);
496 + VM_BUG_ON_PAGE(PageUnevictable(page), page);
497 + VM_BUG_ON_PAGE(PageActive(page), page);
498 + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
499 + VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
500 +
501 + prefetchw_prev_lru_page(page, head, flags);
502 +
503 + scanned += delta;
504 +
505 + if (sort_page(page, lruvec, tier))
506 + sorted += delta;
507 + else if (isolate_page(page, lruvec, sc)) {
508 + list_add(&page->lru, list);
509 + isolated += delta;
510 + } else {
511 + list_move(&page->lru, &moved);
512 + skipped += delta;
513 + }
514 +
515 + if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE)
516 + break;
517 + }
518 +
519 + if (skipped) {
520 + list_splice(&moved, head);
521 + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
522 + }
523 +
524 + if (!remaining || isolated >= MIN_BATCH_SIZE)
525 + break;
526 + }
527 +
528 + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
529 + if (!cgroup_reclaim(sc)) {
530 + __count_vm_events(item, isolated);
531 + __count_vm_events(PGREFILL, sorted);
532 + }
533 + __count_memcg_events(memcg, item, isolated);
534 + __count_memcg_events(memcg, PGREFILL, sorted);
535 + __count_vm_events(PGSCAN_ANON + type, isolated);
536 +
537 + /*
538 + * We may have trouble finding eligible pages due to reclaim_idx,
539 + * may_unmap and may_writepage. Check `remaining` to make sure we won't
540 + * be stuck if we aren't making enough progress.
541 + */
542 + return isolated || !remaining ? scanned : 0;
543 +}
544 +
545 +static int get_tier_idx(struct lruvec *lruvec, int type)
546 +{
547 + int tier;
548 + struct ctrl_pos sp, pv;
549 +
550 + /*
551 + * Ideally we don't want to evict upper tiers that have higher refaulted
552 + * %. However, we need to leave a margin for the fluctuation in
553 + * refaulted %. So we use a larger gain factor to make sure upper tiers
554 + * are indeed more active. We choose 2 because the lowest upper tier
555 + * would have twice the refaulted % of the base tier, according to their
556 + * numbers of accesses.
557 + */
558 + read_ctrl_pos(lruvec, type, 0, 1, &sp);
559 + for (tier = 1; tier < MAX_NR_TIERS; tier++) {
560 + read_ctrl_pos(lruvec, type, tier, 2, &pv);
561 + if (!positive_ctrl_err(&sp, &pv))
562 + break;
563 + }
564 +
565 + return tier - 1;
566 +}
567 +
568 +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
569 +{
570 + int type, tier;
571 + struct ctrl_pos sp, pv;
572 + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
573 +
574 + /*
575 + * Compare refaulted % between the base tiers of anon and file to
576 + * determine which type to evict. Also need to compare refaulted % of
577 + * the upper tiers of the selected type with that of the base tier of
578 + * the other type to determine which tier of the selected type to evict.
579 + */
580 + read_ctrl_pos(lruvec, 0, 0, gain[0], &sp);
581 + read_ctrl_pos(lruvec, 1, 0, gain[1], &pv);
582 + type = positive_ctrl_err(&sp, &pv);
583 +
584 + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
585 + for (tier = 1; tier < MAX_NR_TIERS; tier++) {
586 + read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
587 + if (!positive_ctrl_err(&sp, &pv))
588 + break;
589 + }
590 +
591 + *tier_idx = tier - 1;
592 +
593 + return type;
594 +}
595 +
596 +static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
597 + int *type_scanned, struct list_head *list)
598 +{
599 + int i;
600 + int type;
601 + int scanned;
602 + int tier = -1;
603 + DEFINE_MIN_SEQ(lruvec);
604 +
605 + VM_BUG_ON(!seq_is_valid(lruvec));
606 +
607 + /*
608 + * Try to select a type based on generations and swappiness, and if that
609 + * fails, fall back to get_type_to_scan(). When anon and file are both
610 + * available from the same generation, swappiness 200 is interpreted as
611 + * anon first and swappiness 1 is interpreted as file first.
612 + */
613 + if (!swappiness)
614 + type = 1;
615 + else if (min_seq[0] < min_seq[1])
616 + type = 0;
617 + else if (swappiness == 1)
618 + type = 1;
619 + else if (swappiness == 200)
620 + type = 0;
621 + else
622 + type = get_type_to_scan(lruvec, swappiness, &tier);
623 +
624 + for (i = !swappiness; i < ANON_AND_FILE; i++) {
625 + if (tier < 0)
626 + tier = get_tier_idx(lruvec, type);
627 +
628 + scanned = scan_pages(lruvec, sc, type, tier, list);
629 + if (scanned)
630 + break;
631 +
632 + type = !type;
633 + tier = -1;
634 + }
635 +
636 + *type_scanned = type;
637 +
638 + return scanned;
639 +}
640 +
641 +/* Main function used by the foreground, the background and the user-triggered eviction. */
642 +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
643 +{
644 + int type;
645 + int scanned;
646 + int reclaimed;
647 + LIST_HEAD(list);
648 + struct page *page;
649 + enum vm_event_item item;
650 + struct reclaim_stat stat;
651 + struct mm_walk_args *args;
652 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
653 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
654 +
655 + spin_lock_irq(&lruvec->lru_lock);
656 +
657 + scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
658 +
659 + if (try_to_inc_min_seq(lruvec, swappiness))
660 + scanned++;
661 +
662 + if (get_nr_gens(lruvec, 1) == MIN_NR_GENS)
663 + scanned = 0;
664 +
665 + spin_unlock_irq(&lruvec->lru_lock);
666 +
667 + if (list_empty(&list))
668 + return scanned;
669 +
670 + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
671 + /*
672 + * We need to prevent rejected pages from being added back to the same
673 + * lists they were isolated from. Otherwise we may risk looping on them
674 + * forever.
675 + */
676 + list_for_each_entry(page, &list, lru) {
677 + if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page)))
678 + SetPageActive(page);
679 +
680 + ClearPageReferenced(page);
681 + ClearPageWorkingset(page);
682 + }
683 +
684 + spin_lock_irq(&lruvec->lru_lock);
685 +
686 + move_pages_to_lru(lruvec, &list);
687 +
688 + args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL;
689 + if (args && args->batch_size)
690 + reset_batch_size(lruvec, args);
691 +
692 + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
693 + if (!cgroup_reclaim(sc))
694 + __count_vm_events(item, reclaimed);
695 + __count_memcg_events(memcg, item, reclaimed);
696 + __count_vm_events(PGSTEAL_ANON + type, reclaimed);
697 +
698 + spin_unlock_irq(&lruvec->lru_lock);
699 +
700 + mem_cgroup_uncharge_list(&list);
701 + free_unref_page_list(&list);
702 +
703 + sc->nr_reclaimed += reclaimed;
704 +
705 + return scanned;
706 +}
707 +
708 +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
709 +{
710 + bool low;
711 + long nr_to_scan;
712 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
713 + int priority = sc->priority;
714 + DEFINE_MAX_SEQ(lruvec);
715 + DEFINE_MIN_SEQ(lruvec);
716 +
717 + if (mem_cgroup_below_min(memcg) ||
718 + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
719 + return 0;
720 +
721 + if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
722 + priority = DEF_PRIORITY;
723 + sc->force_deactivate = 0;
724 + }
725 +
726 + nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low);
727 + if (!nr_to_scan)
728 + return 0;
729 +
730 + nr_to_scan >>= priority;
731 +
732 + if (!mem_cgroup_online(memcg))
733 + nr_to_scan++;
734 +
735 + if (!nr_to_scan)
736 + return 0;
737 +
738 + if (current_is_kswapd()) {
739 + /* leave the work to lru_gen_age_node() */
740 + if (max_seq - min_seq[1] < MIN_NR_GENS)
741 + return 0;
742 +
743 + if (!low)
744 + sc->force_deactivate = 0;
745 +
746 + return nr_to_scan;
747 + }
748 +
749 + if (max_seq - min_seq[1] >= MIN_NR_GENS)
750 + return nr_to_scan;
751 +
752 + /* move onto slab and other memcgs if we haven't tried them all */
753 + if (!sc->force_deactivate) {
754 + sc->skipped_deactivate = 1;
755 + return 0;
756 + }
757 +
758 + return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0;
759 +}
760 +
761 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
762 +{
763 + struct blk_plug plug;
764 + long scanned = 0;
765 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
766 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
767 +
768 + lru_add_drain();
769 +
770 + if (current_is_kswapd())
771 + current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args;
772 +
773 + blk_start_plug(&plug);
774 +
775 + while (true) {
776 + int delta;
777 + int swappiness;
778 + long nr_to_scan;
779 +
780 + if (sc->may_swap)
781 + swappiness = get_swappiness(memcg);
782 + else if (!cgroup_reclaim(sc) && get_swappiness(memcg))
783 + swappiness = 1;
784 + else
785 + swappiness = 0;
786 +
787 + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
788 + if (!nr_to_scan)
789 + break;
790 +
791 + delta = evict_pages(lruvec, sc, swappiness);
792 + if (!delta)
793 + break;
794 +
795 + scanned += delta;
796 + if (scanned >= nr_to_scan)
797 + break;
798 +
799 + cond_resched();
800 + }
801 +
802 + blk_finish_plug(&plug);
803 +
804 + if (current_is_kswapd())
805 + current->reclaim_state->mm_walk_args = NULL;
806 +}
807 +
808 +/******************************************************************************
809 * state change
810 ******************************************************************************/
811
812 @@ -4355,6 +4896,10 @@ static void lru_gen_age_node(struct pgli
813 {
814 }
815
816 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
817 +{
818 +}
819 +
820 #endif /* CONFIG_LRU_GEN */
821
822 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
823 @@ -4368,6 +4913,11 @@ static void shrink_lruvec(struct lruvec
824 bool proportional_reclaim;
825 struct blk_plug plug;
826
827 + if (lru_gen_enabled()) {
828 + lru_gen_shrink_lruvec(lruvec, sc);
829 + return;
830 + }
831 +
832 get_scan_count(lruvec, sc, nr);
833
834 /* Record the original scan target for proportional adjustments later */
835 @@ -4839,6 +5389,9 @@ static void snapshot_refaults(struct mem
836 struct lruvec *target_lruvec;
837 unsigned long refaults;
838
839 + if (lru_gen_enabled())
840 + return;
841 +
842 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
843 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
844 target_lruvec->refaults[0] = refaults;
845 --- a/mm/workingset.c
846 +++ b/mm/workingset.c
847 @@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
848 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
849 bool workingset)
850 {
851 - eviction >>= bucket_order;
852 eviction &= EVICTION_MASK;
853 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
854 eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
855 @@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow,
856
857 *memcgidp = memcgid;
858 *pgdat = NODE_DATA(nid);
859 - *evictionp = entry << bucket_order;
860 + *evictionp = entry;
861 *workingsetp = workingset;
862 }
863
864 +#ifdef CONFIG_LRU_GEN
865 +
866 +static int page_lru_refs(struct page *page)
867 +{
868 + unsigned long flags = READ_ONCE(page->flags);
869 +
870 + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
871 +
872 + /* see the comment on MAX_NR_TIERS */
873 + return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0;
874 +}
875 +
876 +/* Return a token to be stored in the shadow entry of a page being evicted. */
877 +static void *lru_gen_eviction(struct page *page)
878 +{
879 + int hist, tier;
880 + unsigned long token;
881 + unsigned long min_seq;
882 + struct lruvec *lruvec;
883 + struct lrugen *lrugen;
884 + int type = page_is_file_lru(page);
885 + int refs = page_lru_refs(page);
886 + int delta = thp_nr_pages(page);
887 + bool workingset = PageWorkingset(page);
888 + struct mem_cgroup *memcg = page_memcg(page);
889 + struct pglist_data *pgdat = page_pgdat(page);
890 +
891 + lruvec = mem_cgroup_lruvec(memcg, pgdat);
892 + lrugen = &lruvec->evictable;
893 + min_seq = READ_ONCE(lrugen->min_seq[type]);
894 + token = (min_seq << LRU_REFS_WIDTH) | refs;
895 +
896 + hist = lru_hist_from_seq(min_seq);
897 + tier = lru_tier_from_refs(refs + workingset);
898 + atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
899 +
900 + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
901 +}
902 +
903 +/* Count a refaulted page based on the token stored in its shadow entry. */
904 +static void lru_gen_refault(struct page *page, void *shadow)
905 +{
906 + int hist, tier, refs;
907 + int memcg_id;
908 + bool workingset;
909 + unsigned long token;
910 + unsigned long min_seq;
911 + struct lruvec *lruvec;
912 + struct lrugen *lrugen;
913 + struct mem_cgroup *memcg;
914 + struct pglist_data *pgdat;
915 + int type = page_is_file_lru(page);
916 + int delta = thp_nr_pages(page);
917 +
918 + unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
919 + if (page_pgdat(page) != pgdat)
920 + return;
921 +
922 + rcu_read_lock();
923 + memcg = page_memcg_rcu(page);
924 + if (mem_cgroup_id(memcg) != memcg_id)
925 + goto unlock;
926 +
927 + refs = token & (BIT(LRU_REFS_WIDTH) - 1);
928 + if (refs && !workingset)
929 + goto unlock;
930 +
931 + token >>= LRU_REFS_WIDTH;
932 + lruvec = mem_cgroup_lruvec(memcg, pgdat);
933 + lrugen = &lruvec->evictable;
934 + min_seq = READ_ONCE(lrugen->min_seq[type]);
935 + if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
936 + goto unlock;
937 +
938 + hist = lru_hist_from_seq(min_seq);
939 + tier = lru_tier_from_refs(refs + workingset);
940 + atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
941 + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
942 +
943 + /*
944 + * Tiers don't offer any protection to pages accessed via page tables.
945 + * That's what generations do. Tiers can't fully protect pages after
946 + * their numbers of accesses has exceeded the max value. Conservatively
947 + * count these two conditions as stalls even though they might not
948 + * indicate any real memory pressure.
949 + */
950 + if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) {
951 + SetPageWorkingset(page);
952 + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
953 + }
954 +unlock:
955 + rcu_read_unlock();
956 +}
957 +
958 +#else
959 +
960 +static void *lru_gen_eviction(struct page *page)
961 +{
962 + return NULL;
963 +}
964 +
965 +static void lru_gen_refault(struct page *page, void *shadow)
966 +{
967 +}
968 +
969 +#endif /* CONFIG_LRU_GEN */
970 +
971 /**
972 * workingset_age_nonresident - age non-resident entries as LRU ages
973 * @lruvec: the lruvec that was aged
974 @@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p
975 VM_BUG_ON_PAGE(page_count(page), page);
976 VM_BUG_ON_PAGE(!PageLocked(page), page);
977
978 + if (lru_gen_enabled())
979 + return lru_gen_eviction(page);
980 +
981 lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
982 /* XXX: target_memcg can be NULL, go through lruvec */
983 memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
984 eviction = atomic_long_read(&lruvec->nonresident_age);
985 + eviction >>= bucket_order;
986 workingset_age_nonresident(lruvec, thp_nr_pages(page));
987 return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
988 }
989 @@ -296,7 +406,13 @@ void workingset_refault(struct page *pag
990 bool workingset;
991 int memcgid;
992
993 + if (lru_gen_enabled()) {
994 + lru_gen_refault(page, shadow);
995 + return;
996 + }
997 +
998 unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
999 + eviction <<= bucket_order;
1000
1001 rcu_read_lock();
1002 /*