kernel: bump 5.15 to 5.15.155
openwrt/staging/stintel.git: target/linux/generic/backport-5.15/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch
1 From b564b9471cd60ef1ee3961a224898ce4a9620d84 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Sun, 18 Sep 2022 02:00:03 -0600
4 Subject: [PATCH 06/29] mm: multi-gen LRU: minimal implementation
5 MIME-Version: 1.0
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
8
9 To avoid confusion, the terms "promotion" and "demotion" will be applied
10 to the multi-gen LRU, as a new convention; the terms "activation" and
11 "deactivation" will be applied to the active/inactive LRU, as usual.
12
13 The aging produces young generations. Given an lruvec, it increments
14 max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. The aging promotes
15 hot pages to the youngest generation when it finds them accessed through
16 page tables; the demotion of cold pages happens consequently when it
17 increments max_seq. Promotion in the aging path does not involve any LRU
18 list operations, only the updates of the gen counter and
19 lrugen->nr_pages[]; demotion, unless it is the result of the increment of
20 max_seq, requires LRU list operations, e.g., lru_deactivate_fn(). The
21 aging has the complexity O(nr_hot_pages), since it is only interested in
22 hot pages.
23
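Illustrative sketch (not part of the patch): the aging trigger described
above, as standalone C. MIN_NR_GENS is copied from the patch; needs_aging()
is a hypothetical helper name that mirrors the min_seq/max_seq check done in
should_run_aging() further down.

#include <stdbool.h>

#define MIN_NR_GENS 2U  /* from include/linux/mmzone.h in this patch */

/* a new generation is needed once the number of in-use generations,
 * max_seq - min_seq + 1, has dropped to MIN_NR_GENS */
static bool needs_aging(unsigned long max_seq, unsigned long min_seq)
{
        return max_seq - min_seq + 1 <= MIN_NR_GENS;
}
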
24 The eviction consumes old generations. Given an lruvec, it increments
25 min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes empty.
26 A feedback loop modeled after the PID controller monitors refaults over
27 anon and file types and decides which type to evict when both types are
28 available from the same generation.
29
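As a sketch of the comparison used by this feedback loop (it mirrors struct
ctrl_pos and positive_ctrl_err() added to mm/vmscan.c below; MIN_LRU_BATCH is
copied from the patch, assuming a 64-bit BITS_PER_LONG), refaulted/total
ratios are compared by cross-multiplication so that no division is needed:

#include <stdbool.h>

#define MIN_LRU_BATCH 64                /* BITS_PER_LONG, from this patch */

struct ctrl_pos {
        unsigned long refaulted;        /* refaults seen for this tier */
        unsigned long total;            /* evicted + protected for this tier */
        int gain;                       /* weight, e.g., swappiness for anon */
};

/* true if the process variable (pv) has few refaults or a lower gain-weighted
 * refaulted/total ratio than the setpoint (sp), i.e., pv needs no protection */
static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
{
        return pv->refaulted < MIN_LRU_BATCH ||
               pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
               (sp->refaulted + 1) * pv->total * pv->gain;
}
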
30 The protection of pages accessed multiple times through file descriptors
31 takes place in the eviction path. Each generation is divided into
32 multiple tiers. A page accessed N times through file descriptors is in
33 tier order_base_2(N). Tiers do not have dedicated lrugen->lists[], only
34 bits in page->flags. The aforementioned feedback loop also monitors
35 refaults over all tiers and decides when to protect pages in which tiers
36 (N>1), using the first tier (N=0,1) as a baseline. The first tier
37 contains single-use unmapped clean pages, which are most likely the best
38 choices. In contrast to promotion in the aging path, the protection of a
39 page in the eviction path is achieved by moving this page to the next
40 generation, i.e., min_seq+1, if the feedback loop decides so. This
41 approach has the following advantages:
42
43 1. It removes the cost of activation in the buffered access path by
44 inferring whether pages accessed multiple times through file
45 descriptors are statistically hot and thus worth protecting in the
46 eviction path.
47 2. It takes pages accessed through page tables into account and avoids
48 overprotecting pages accessed multiple times through file
49 descriptors. (Pages accessed through page tables are in the first
50 tier, since N=0.)
51 3. More tiers provide better protection for pages accessed more than
52 twice through file descriptors, when under heavy buffered I/O
53 workloads.
54
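Illustrative sketch only: how the N-accesses-to-tier mapping above works out
with MAX_NR_TIERS=4. tier_of() is a hypothetical standalone helper; the patch
itself stores N-1 in page->flags and computes order_base_2(refs + 1) in
lru_tier_from_refs().

/* tier = order_base_2(N): N=0,1 -> 0; N=2 -> 1; N=3,4 -> 2; N>=5 -> 3
 * (the spare-bit refs counter in page->flags saturates at the last tier) */
static int tier_of(unsigned int n)      /* N accesses through file descriptors */
{
        int tier = 0;

        while (n > 1) {                 /* ceil(log2(n)), i.e., order_base_2(n) */
                n = (n + 1) / 2;
                tier++;
        }
        return tier < 4 ? tier : 3;     /* clamp to MAX_NR_TIERS - 1 */
}
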
55 Server benchmark results:
56 Single workload:
57 fio (buffered I/O): +[30, 32]%
58 IOPS BW
59 5.19-rc1: 2673k 10.2GiB/s
60 patch1-6: 3491k 13.3GiB/s
61
62 Single workload:
63 memcached (anon): -[4, 6]%
64 Ops/sec KB/sec
65 5.19-rc1: 1161501.04 45177.25
66 patch1-6: 1106168.46 43025.04
67
68 Configurations:
69 CPU: two Xeon 6154
70 Mem: total 256G
71
72 Node 1 was only used as a ram disk to reduce the variance in the
73 results.
74
75 patch drivers/block/brd.c <<EOF
76 99,100c99,100
77 < gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
78 < page = alloc_page(gfp_flags);
79 ---
80 > gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE;
81 > page = alloc_pages_node(1, gfp_flags, 0);
82 EOF
83
84 cat >>/etc/systemd/system.conf <<EOF
85 CPUAffinity=numa
86 NUMAPolicy=bind
87 NUMAMask=0
88 EOF
89
90 cat >>/etc/memcached.conf <<EOF
91 -m 184320
92 -s /var/run/memcached/memcached.sock
93 -a 0766
94 -t 36
95 -B binary
96 EOF
97
98 cat fio.sh
99 modprobe brd rd_nr=1 rd_size=113246208
100 swapoff -a
101 mkfs.ext4 /dev/ram0
102 mount -t ext4 /dev/ram0 /mnt
103
104 mkdir /sys/fs/cgroup/user.slice/test
105 echo 38654705664 >/sys/fs/cgroup/user.slice/test/memory.max
106 echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs
107 fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \
108 --buffered=1 --ioengine=io_uring --iodepth=128 \
109 --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
110 --rw=randread --random_distribution=random --norandommap \
111 --time_based --ramp_time=10m --runtime=5m --group_reporting
112
113 cat memcached.sh
114 modprobe brd rd_nr=1 rd_size=113246208
115 swapoff -a
116 mkswap /dev/ram0
117 swapon /dev/ram0
118
119 memtier_benchmark -S /var/run/memcached/memcached.sock \
120 -P memcache_binary -n allkeys --key-minimum=1 \
121 --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \
122 --ratio 1:0 --pipeline 8 -d 2000
123
124 memtier_benchmark -S /var/run/memcached/memcached.sock \
125 -P memcache_binary -n allkeys --key-minimum=1 \
126 --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \
127 --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
128
129 Client benchmark results:
130 kswapd profiles:
131 5.19-rc1
132 40.33% page_vma_mapped_walk (overhead)
133 21.80% lzo1x_1_do_compress (real work)
134 7.53% do_raw_spin_lock
135 3.95% _raw_spin_unlock_irq
136 2.52% vma_interval_tree_iter_next
137 2.37% page_referenced_one
138 2.28% vma_interval_tree_subtree_search
139 1.97% anon_vma_interval_tree_iter_first
140 1.60% ptep_clear_flush
141 1.06% __zram_bvec_write
142
143 patch1-6
144 39.03% lzo1x_1_do_compress (real work)
145 18.47% page_vma_mapped_walk (overhead)
146 6.74% _raw_spin_unlock_irq
147 3.97% do_raw_spin_lock
148 2.49% ptep_clear_flush
149 2.48% anon_vma_interval_tree_iter_first
150 1.92% page_referenced_one
151 1.88% __zram_bvec_write
152 1.48% memmove
153 1.31% vma_interval_tree_iter_next
154
155 Configurations:
156 CPU: single Snapdragon 7c
157 Mem: total 4G
158
159 ChromeOS MemoryPressure [1]
160
161 [1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/
162
163 Link: https://lkml.kernel.org/r/20220918080010.2920238-7-yuzhao@google.com
164 Signed-off-by: Yu Zhao <yuzhao@google.com>
165 Acked-by: Brian Geffon <bgeffon@google.com>
166 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
167 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
168 Acked-by: Steven Barrett <steven@liquorix.net>
169 Acked-by: Suleiman Souhlal <suleiman@google.com>
170 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
171 Tested-by: Donald Carr <d@chaos-reins.com>
172 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
173 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
174 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
175 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
176 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
177 Cc: Andi Kleen <ak@linux.intel.com>
178 Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
179 Cc: Barry Song <baohua@kernel.org>
180 Cc: Catalin Marinas <catalin.marinas@arm.com>
181 Cc: Dave Hansen <dave.hansen@linux.intel.com>
182 Cc: Hillf Danton <hdanton@sina.com>
183 Cc: Jens Axboe <axboe@kernel.dk>
184 Cc: Johannes Weiner <hannes@cmpxchg.org>
185 Cc: Jonathan Corbet <corbet@lwn.net>
186 Cc: Linus Torvalds <torvalds@linux-foundation.org>
187 Cc: Matthew Wilcox <willy@infradead.org>
188 Cc: Mel Gorman <mgorman@suse.de>
189 Cc: Miaohe Lin <linmiaohe@huawei.com>
190 Cc: Michael Larabel <Michael@MichaelLarabel.com>
191 Cc: Michal Hocko <mhocko@kernel.org>
192 Cc: Mike Rapoport <rppt@kernel.org>
193 Cc: Mike Rapoport <rppt@linux.ibm.com>
194 Cc: Peter Zijlstra <peterz@infradead.org>
195 Cc: Qi Zheng <zhengqi.arch@bytedance.com>
196 Cc: Tejun Heo <tj@kernel.org>
197 Cc: Vlastimil Babka <vbabka@suse.cz>
198 Cc: Will Deacon <will@kernel.org>
199 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
200 ---
201 include/linux/mm_inline.h | 36 ++
202 include/linux/mmzone.h | 41 ++
203 include/linux/page-flags-layout.h | 5 +-
204 kernel/bounds.c | 2 +
205 mm/Kconfig | 11 +
206 mm/swap.c | 39 ++
207 mm/vmscan.c | 792 +++++++++++++++++++++++++++++-
208 mm/workingset.c | 110 ++++-
209 8 files changed, 1025 insertions(+), 11 deletions(-)
210
211 --- a/include/linux/mm_inline.h
212 +++ b/include/linux/mm_inline.h
213 @@ -106,6 +106,33 @@ static inline int lru_gen_from_seq(unsig
214 return seq % MAX_NR_GENS;
215 }
216
217 +static inline int lru_hist_from_seq(unsigned long seq)
218 +{
219 + return seq % NR_HIST_GENS;
220 +}
221 +
222 +static inline int lru_tier_from_refs(int refs)
223 +{
224 + VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
225 +
226 + /* see the comment in page_lru_refs() */
227 + return order_base_2(refs + 1);
228 +}
229 +
230 +static inline int page_lru_refs(struct page *page)
231 +{
232 + unsigned long flags = READ_ONCE(page->flags);
233 + bool workingset = flags & BIT(PG_workingset);
234 +
235 + /*
236 + * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
237 + * total number of accesses is N>1, since N=0,1 both map to the first
238 + * tier. lru_tier_from_refs() will account for this off-by-one. Also see
239 + * the comment on MAX_NR_TIERS.
240 + */
241 + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
242 +}
243 +
244 static inline int page_lru_gen(struct page *page)
245 {
246 unsigned long flags = READ_ONCE(page->flags);
247 @@ -158,6 +185,15 @@ static inline void lru_gen_update_size(s
248 __update_lru_size(lruvec, lru, zone, -delta);
249 return;
250 }
251 +
252 + /* promotion */
253 + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
254 + __update_lru_size(lruvec, lru, zone, -delta);
255 + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
256 + }
257 +
258 + /* demotion requires isolation, e.g., lru_deactivate_fn() */
259 + VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
260 }
261
262 static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
263 --- a/include/linux/mmzone.h
264 +++ b/include/linux/mmzone.h
265 @@ -327,6 +327,28 @@ enum lruvec_flags {
266 #define MIN_NR_GENS 2U
267 #define MAX_NR_GENS 4U
268
269 +/*
270 + * Each generation is divided into multiple tiers. A page accessed N times
271 + * through file descriptors is in tier order_base_2(N). A page in the first tier
272 + * (N=0,1) is marked by PG_referenced unless it was faulted in through page
273 + * tables or read ahead. A page in any other tier (N>1) is marked by
274 + * PG_referenced and PG_workingset. This implies a minimum of two tiers is
275 + * supported without using additional bits in page->flags.
276 + *
277 + * In contrast to moving across generations which requires the LRU lock, moving
278 + * across tiers only involves atomic operations on page->flags and therefore
279 + * has a negligible cost in the buffered access path. In the eviction path,
280 + * comparisons of refaulted/(evicted+protected) from the first tier and the
281 + * rest infer whether pages accessed multiple times through file descriptors
282 + * are statistically hot and thus worth protecting.
283 + *
284 + * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
285 + * number of categories of the active/inactive LRU when keeping track of
286 + * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
287 + * page->flags.
288 + */
289 +#define MAX_NR_TIERS 4U
290 +
291 #ifndef __GENERATING_BOUNDS_H
292
293 struct lruvec;
294 @@ -341,6 +363,16 @@ enum {
295 LRU_GEN_FILE,
296 };
297
298 +#define MIN_LRU_BATCH BITS_PER_LONG
299 +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
300 +
301 +/* whether to keep historical stats from evicted generations */
302 +#ifdef CONFIG_LRU_GEN_STATS
303 +#define NR_HIST_GENS MAX_NR_GENS
304 +#else
305 +#define NR_HIST_GENS 1U
306 +#endif
307 +
308 /*
309 * The youngest generation number is stored in max_seq for both anon and file
310 * types as they are aged on an equal footing. The oldest generation numbers are
311 @@ -363,6 +395,15 @@ struct lru_gen_struct {
312 struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
313 /* the multi-gen LRU sizes, eventually consistent */
314 long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
315 + /* the exponential moving average of refaulted */
316 + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
317 + /* the exponential moving average of evicted+protected */
318 + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
319 + /* the first tier doesn't need protection, hence the minus one */
320 + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
321 + /* can be modified without holding the LRU lock */
322 + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
323 + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
324 };
325
326 void lru_gen_init_lruvec(struct lruvec *lruvec);
327 --- a/include/linux/page-flags-layout.h
328 +++ b/include/linux/page-flags-layout.h
329 @@ -106,7 +106,10 @@
330 #error "Not enough bits in page flags"
331 #endif
332
333 -#define LRU_REFS_WIDTH 0
334 +/* see the comment on MAX_NR_TIERS */
335 +#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \
336 + ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
337 + NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
338
339 #endif
340 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
341 --- a/kernel/bounds.c
342 +++ b/kernel/bounds.c
343 @@ -24,8 +24,10 @@ int main(void)
344 DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
345 #ifdef CONFIG_LRU_GEN
346 DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
347 + DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
348 #else
349 DEFINE(LRU_GEN_WIDTH, 0);
350 + DEFINE(__LRU_REFS_WIDTH, 0);
351 #endif
352 /* End of constants */
353
354 --- a/mm/Kconfig
355 +++ b/mm/Kconfig
356 @@ -897,6 +897,7 @@ config IO_MAPPING
357 config SECRETMEM
358 def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
359
360 +# multi-gen LRU {
361 config LRU_GEN
362 bool "Multi-Gen LRU"
363 depends on MMU
364 @@ -905,6 +906,16 @@ config LRU_GEN
365 help
366 A high performance LRU implementation to overcommit memory.
367
368 +config LRU_GEN_STATS
369 + bool "Full stats for debugging"
370 + depends on LRU_GEN
371 + help
372 + Do not enable this option unless you plan to look at historical stats
373 + from evicted generations for debugging purpose.
374 +
375 + This option has a per-memcg and per-node memory overhead.
376 +# }
377 +
378 source "mm/damon/Kconfig"
379
380 endmenu
381 --- a/mm/swap.c
382 +++ b/mm/swap.c
383 @@ -389,6 +389,40 @@ static void __lru_cache_activate_page(st
384 local_unlock(&lru_pvecs.lock);
385 }
386
387 +#ifdef CONFIG_LRU_GEN
388 +static void page_inc_refs(struct page *page)
389 +{
390 + unsigned long new_flags, old_flags = READ_ONCE(page->flags);
391 +
392 + if (PageUnevictable(page))
393 + return;
394 +
395 + if (!PageReferenced(page)) {
396 + SetPageReferenced(page);
397 + return;
398 + }
399 +
400 + if (!PageWorkingset(page)) {
401 + SetPageWorkingset(page);
402 + return;
403 + }
404 +
405 + /* see the comment on MAX_NR_TIERS */
406 + do {
407 + new_flags = old_flags & LRU_REFS_MASK;
408 + if (new_flags == LRU_REFS_MASK)
409 + break;
410 +
411 + new_flags += BIT(LRU_REFS_PGOFF);
412 + new_flags |= old_flags & ~LRU_REFS_MASK;
413 + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
414 +}
415 +#else
416 +static void page_inc_refs(struct page *page)
417 +{
418 +}
419 +#endif /* CONFIG_LRU_GEN */
420 +
421 /*
422 * Mark a page as having seen activity.
423 *
424 @@ -403,6 +437,11 @@ void mark_page_accessed(struct page *pag
425 {
426 page = compound_head(page);
427
428 + if (lru_gen_enabled()) {
429 + page_inc_refs(page);
430 + return;
431 + }
432 +
433 if (!PageReferenced(page)) {
434 SetPageReferenced(page);
435 } else if (PageUnevictable(page)) {
436 --- a/mm/vmscan.c
437 +++ b/mm/vmscan.c
438 @@ -1142,9 +1142,11 @@ static int __remove_mapping(struct addre
439
440 if (PageSwapCache(page)) {
441 swp_entry_t swap = { .val = page_private(page) };
442 - mem_cgroup_swapout(page, swap);
443 +
444 + /* get a shadow entry before mem_cgroup_swapout() clears page_memcg() */
445 if (reclaimed && !mapping_exiting(mapping))
446 shadow = workingset_eviction(page, target_memcg);
447 + mem_cgroup_swapout(page, swap);
448 __delete_from_swap_cache(page, swap, shadow);
449 xa_unlock_irq(&mapping->i_pages);
450 put_swap_page(page, swap);
451 @@ -2502,6 +2504,9 @@ static void prepare_scan_count(pg_data_t
452 unsigned long file;
453 struct lruvec *target_lruvec;
454
455 + if (lru_gen_enabled())
456 + return;
457 +
458 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
459
460 /*
461 @@ -2827,6 +2832,17 @@ static bool can_age_anon_pages(struct pg
462 * shorthand helpers
463 ******************************************************************************/
464
465 +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
466 +
467 +#define DEFINE_MAX_SEQ(lruvec) \
468 + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
469 +
470 +#define DEFINE_MIN_SEQ(lruvec) \
471 + unsigned long min_seq[ANON_AND_FILE] = { \
472 + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
473 + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
474 + }
475 +
476 #define for_each_gen_type_zone(gen, type, zone) \
477 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
478 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
479 @@ -2852,6 +2868,745 @@ static struct lruvec __maybe_unused *get
480 return pgdat ? &pgdat->__lruvec : NULL;
481 }
482
483 +static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
484 +{
485 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
486 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
487 +
488 + if (!can_demote(pgdat->node_id, sc) &&
489 + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
490 + return 0;
491 +
492 + return mem_cgroup_swappiness(memcg);
493 +}
494 +
495 +static int get_nr_gens(struct lruvec *lruvec, int type)
496 +{
497 + return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
498 +}
499 +
500 +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
501 +{
502 + /* see the comment on lru_gen_struct */
503 + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
504 + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
505 + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
506 +}
507 +
508 +/******************************************************************************
509 + * refault feedback loop
510 + ******************************************************************************/
511 +
512 +/*
513 + * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
514 + *
515 + * The P term is refaulted/(evicted+protected) from a tier in the generation
516 + * currently being evicted; the I term is the exponential moving average of the
517 + * P term over the generations previously evicted, using the smoothing factor
518 + * 1/2; the D term isn't supported.
519 + *
520 + * The setpoint (SP) is always the first tier of one type; the process variable
521 + * (PV) is either any tier of the other type or any other tier of the same
522 + * type.
523 + *
524 + * The error is the difference between the SP and the PV; the correction is to
525 + * turn off protection when SP>PV or turn on protection when SP<PV.
526 + *
527 + * For future optimizations:
528 + * 1. The D term may discount the other two terms over time so that long-lived
529 + * generations can resist stale information.
530 + */
531 +struct ctrl_pos {
532 + unsigned long refaulted;
533 + unsigned long total;
534 + int gain;
535 +};
536 +
537 +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
538 + struct ctrl_pos *pos)
539 +{
540 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
541 + int hist = lru_hist_from_seq(lrugen->min_seq[type]);
542 +
543 + pos->refaulted = lrugen->avg_refaulted[type][tier] +
544 + atomic_long_read(&lrugen->refaulted[hist][type][tier]);
545 + pos->total = lrugen->avg_total[type][tier] +
546 + atomic_long_read(&lrugen->evicted[hist][type][tier]);
547 + if (tier)
548 + pos->total += lrugen->protected[hist][type][tier - 1];
549 + pos->gain = gain;
550 +}
551 +
552 +static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
553 +{
554 + int hist, tier;
555 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
556 + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
557 + unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
558 +
559 + lockdep_assert_held(&lruvec->lru_lock);
560 +
561 + if (!carryover && !clear)
562 + return;
563 +
564 + hist = lru_hist_from_seq(seq);
565 +
566 + for (tier = 0; tier < MAX_NR_TIERS; tier++) {
567 + if (carryover) {
568 + unsigned long sum;
569 +
570 + sum = lrugen->avg_refaulted[type][tier] +
571 + atomic_long_read(&lrugen->refaulted[hist][type][tier]);
572 + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
573 +
574 + sum = lrugen->avg_total[type][tier] +
575 + atomic_long_read(&lrugen->evicted[hist][type][tier]);
576 + if (tier)
577 + sum += lrugen->protected[hist][type][tier - 1];
578 + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
579 + }
580 +
581 + if (clear) {
582 + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
583 + atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
584 + if (tier)
585 + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
586 + }
587 + }
588 +}
589 +
590 +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
591 +{
592 + /*
593 + * Return true if the PV has a limited number of refaults or a lower
594 + * refaulted/total than the SP.
595 + */
596 + return pv->refaulted < MIN_LRU_BATCH ||
597 + pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
598 + (sp->refaulted + 1) * pv->total * pv->gain;
599 +}
600 +
601 +/******************************************************************************
602 + * the aging
603 + ******************************************************************************/
604 +
605 +/* protect pages accessed multiple times through file descriptors */
606 +static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
607 +{
608 + int type = page_is_file_lru(page);
609 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
610 + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
611 + unsigned long new_flags, old_flags = READ_ONCE(page->flags);
612 +
613 + VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
614 +
615 + do {
616 + new_gen = (old_gen + 1) % MAX_NR_GENS;
617 +
618 + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
619 + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
620 + /* for end_page_writeback() */
621 + if (reclaiming)
622 + new_flags |= BIT(PG_reclaim);
623 + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
624 +
625 + lru_gen_update_size(lruvec, page, old_gen, new_gen);
626 +
627 + return new_gen;
628 +}
629 +
630 +static void inc_min_seq(struct lruvec *lruvec, int type)
631 +{
632 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
633 +
634 + reset_ctrl_pos(lruvec, type, true);
635 + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
636 +}
637 +
638 +static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
639 +{
640 + int gen, type, zone;
641 + bool success = false;
642 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
643 + DEFINE_MIN_SEQ(lruvec);
644 +
645 + VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
646 +
647 + /* find the oldest populated generation */
648 + for (type = !can_swap; type < ANON_AND_FILE; type++) {
649 + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
650 + gen = lru_gen_from_seq(min_seq[type]);
651 +
652 + for (zone = 0; zone < MAX_NR_ZONES; zone++) {
653 + if (!list_empty(&lrugen->lists[gen][type][zone]))
654 + goto next;
655 + }
656 +
657 + min_seq[type]++;
658 + }
659 +next:
660 + ;
661 + }
662 +
663 + /* see the comment on lru_gen_struct */
664 + if (can_swap) {
665 + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
666 + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
667 + }
668 +
669 + for (type = !can_swap; type < ANON_AND_FILE; type++) {
670 + if (min_seq[type] == lrugen->min_seq[type])
671 + continue;
672 +
673 + reset_ctrl_pos(lruvec, type, true);
674 + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
675 + success = true;
676 + }
677 +
678 + return success;
679 +}
680 +
681 +static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
682 +{
683 + int prev, next;
684 + int type, zone;
685 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
686 +
687 + spin_lock_irq(&lruvec->lru_lock);
688 +
689 + VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
690 +
691 + if (max_seq != lrugen->max_seq)
692 + goto unlock;
693 +
694 + for (type = ANON_AND_FILE - 1; type >= 0; type--) {
695 + if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
696 + continue;
697 +
698 + VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
699 +
700 + inc_min_seq(lruvec, type);
701 + }
702 +
703 + /*
704 + * Update the active/inactive LRU sizes for compatibility. Both sides of
705 + * the current max_seq need to be covered, since max_seq+1 can overlap
706 + * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
707 + * overlap, cold/hot inversion happens.
708 + */
709 + prev = lru_gen_from_seq(lrugen->max_seq - 1);
710 + next = lru_gen_from_seq(lrugen->max_seq + 1);
711 +
712 + for (type = 0; type < ANON_AND_FILE; type++) {
713 + for (zone = 0; zone < MAX_NR_ZONES; zone++) {
714 + enum lru_list lru = type * LRU_INACTIVE_FILE;
715 + long delta = lrugen->nr_pages[prev][type][zone] -
716 + lrugen->nr_pages[next][type][zone];
717 +
718 + if (!delta)
719 + continue;
720 +
721 + __update_lru_size(lruvec, lru, zone, delta);
722 + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
723 + }
724 + }
725 +
726 + for (type = 0; type < ANON_AND_FILE; type++)
727 + reset_ctrl_pos(lruvec, type, false);
728 +
729 + /* make sure preceding modifications appear */
730 + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
731 +unlock:
732 + spin_unlock_irq(&lruvec->lru_lock);
733 +}
734 +
735 +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
736 + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
737 +{
738 + int gen, type, zone;
739 + unsigned long old = 0;
740 + unsigned long young = 0;
741 + unsigned long total = 0;
742 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
743 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
744 +
745 + for (type = !can_swap; type < ANON_AND_FILE; type++) {
746 + unsigned long seq;
747 +
748 + for (seq = min_seq[type]; seq <= max_seq; seq++) {
749 + unsigned long size = 0;
750 +
751 + gen = lru_gen_from_seq(seq);
752 +
753 + for (zone = 0; zone < MAX_NR_ZONES; zone++)
754 + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
755 +
756 + total += size;
757 + if (seq == max_seq)
758 + young += size;
759 + else if (seq + MIN_NR_GENS == max_seq)
760 + old += size;
761 + }
762 + }
763 +
764 + /* try to scrape all its memory if this memcg was deleted */
765 + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
766 +
767 + /*
768 + * The aging tries to be lazy to reduce the overhead, while the eviction
769 + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
770 + * ideal number of generations is MIN_NR_GENS+1.
771 + */
772 + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
773 + return true;
774 + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
775 + return false;
776 +
777 + /*
778 + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
779 + * of the total number of pages for each generation. A reasonable range
780 + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
781 + * aging cares about the upper bound of hot pages, while the eviction
782 + * cares about the lower bound of cold pages.
783 + */
784 + if (young * MIN_NR_GENS > total)
785 + return true;
786 + if (old * (MIN_NR_GENS + 2) < total)
787 + return true;
788 +
789 + return false;
790 +}
791 +
792 +static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
793 +{
794 + bool need_aging;
795 + unsigned long nr_to_scan;
796 + int swappiness = get_swappiness(lruvec, sc);
797 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
798 + DEFINE_MAX_SEQ(lruvec);
799 + DEFINE_MIN_SEQ(lruvec);
800 +
801 + VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
802 +
803 + mem_cgroup_calculate_protection(NULL, memcg);
804 +
805 + if (mem_cgroup_below_min(memcg))
806 + return;
807 +
808 + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
809 + if (need_aging)
810 + inc_max_seq(lruvec, max_seq, swappiness);
811 +}
812 +
813 +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
814 +{
815 + struct mem_cgroup *memcg;
816 +
817 + VM_WARN_ON_ONCE(!current_is_kswapd());
818 +
819 + memcg = mem_cgroup_iter(NULL, NULL, NULL);
820 + do {
821 + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
822 +
823 + age_lruvec(lruvec, sc);
824 +
825 + cond_resched();
826 + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
827 +}
828 +
829 +/******************************************************************************
830 + * the eviction
831 + ******************************************************************************/
832 +
833 +static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx)
834 +{
835 + bool success;
836 + int gen = page_lru_gen(page);
837 + int type = page_is_file_lru(page);
838 + int zone = page_zonenum(page);
839 + int delta = thp_nr_pages(page);
840 + int refs = page_lru_refs(page);
841 + int tier = lru_tier_from_refs(refs);
842 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
843 +
844 + VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);
845 +
846 + /* unevictable */
847 + if (!page_evictable(page)) {
848 + success = lru_gen_del_page(lruvec, page, true);
849 + VM_WARN_ON_ONCE_PAGE(!success, page);
850 + SetPageUnevictable(page);
851 + add_page_to_lru_list(page, lruvec);
852 + __count_vm_events(UNEVICTABLE_PGCULLED, delta);
853 + return true;
854 + }
855 +
856 + /* dirty lazyfree */
857 + if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) {
858 + success = lru_gen_del_page(lruvec, page, true);
859 + VM_WARN_ON_ONCE_PAGE(!success, page);
860 + SetPageSwapBacked(page);
861 + add_page_to_lru_list_tail(page, lruvec);
862 + return true;
863 + }
864 +
865 + /* protected */
866 + if (tier > tier_idx) {
867 + int hist = lru_hist_from_seq(lrugen->min_seq[type]);
868 +
869 + gen = page_inc_gen(lruvec, page, false);
870 + list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
871 +
872 + WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
873 + lrugen->protected[hist][type][tier - 1] + delta);
874 + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
875 + return true;
876 + }
877 +
878 + /* waiting for writeback */
879 + if (PageLocked(page) || PageWriteback(page) ||
880 + (type == LRU_GEN_FILE && PageDirty(page))) {
881 + gen = page_inc_gen(lruvec, page, true);
882 + list_move(&page->lru, &lrugen->lists[gen][type][zone]);
883 + return true;
884 + }
885 +
886 + return false;
887 +}
888 +
889 +static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc)
890 +{
891 + bool success;
892 +
893 + /* unmapping inhibited */
894 + if (!sc->may_unmap && page_mapped(page))
895 + return false;
896 +
897 + /* swapping inhibited */
898 + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
899 + (PageDirty(page) ||
900 + (PageAnon(page) && !PageSwapCache(page))))
901 + return false;
902 +
903 + /* raced with release_pages() */
904 + if (!get_page_unless_zero(page))
905 + return false;
906 +
907 + /* raced with another isolation */
908 + if (!TestClearPageLRU(page)) {
909 + put_page(page);
910 + return false;
911 + }
912 +
913 + /* see the comment on MAX_NR_TIERS */
914 + if (!PageReferenced(page))
915 + set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
916 +
917 + /* for shrink_page_list() */
918 + ClearPageReclaim(page);
919 + ClearPageReferenced(page);
920 +
921 + success = lru_gen_del_page(lruvec, page, true);
922 + VM_WARN_ON_ONCE_PAGE(!success, page);
923 +
924 + return true;
925 +}
926 +
927 +static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
928 + int type, int tier, struct list_head *list)
929 +{
930 + int gen, zone;
931 + enum vm_event_item item;
932 + int sorted = 0;
933 + int scanned = 0;
934 + int isolated = 0;
935 + int remaining = MAX_LRU_BATCH;
936 + struct lru_gen_struct *lrugen = &lruvec->lrugen;
937 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
938 +
939 + VM_WARN_ON_ONCE(!list_empty(list));
940 +
941 + if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
942 + return 0;
943 +
944 + gen = lru_gen_from_seq(lrugen->min_seq[type]);
945 +
946 + for (zone = sc->reclaim_idx; zone >= 0; zone--) {
947 + LIST_HEAD(moved);
948 + int skipped = 0;
949 + struct list_head *head = &lrugen->lists[gen][type][zone];
950 +
951 + while (!list_empty(head)) {
952 + struct page *page = lru_to_page(head);
953 + int delta = thp_nr_pages(page);
954 +
955 + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
956 + VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
957 + VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
958 + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
959 +
960 + scanned += delta;
961 +
962 + if (sort_page(lruvec, page, tier))
963 + sorted += delta;
964 + else if (isolate_page(lruvec, page, sc)) {
965 + list_add(&page->lru, list);
966 + isolated += delta;
967 + } else {
968 + list_move(&page->lru, &moved);
969 + skipped += delta;
970 + }
971 +
972 + if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
973 + break;
974 + }
975 +
976 + if (skipped) {
977 + list_splice(&moved, head);
978 + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
979 + }
980 +
981 + if (!remaining || isolated >= MIN_LRU_BATCH)
982 + break;
983 + }
984 +
985 + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
986 + if (!cgroup_reclaim(sc)) {
987 + __count_vm_events(item, isolated);
988 + __count_vm_events(PGREFILL, sorted);
989 + }
990 + __count_memcg_events(memcg, item, isolated);
991 + __count_memcg_events(memcg, PGREFILL, sorted);
992 + __count_vm_events(PGSCAN_ANON + type, isolated);
993 +
994 + /*
995 + * There might not be eligible pages due to reclaim_idx, may_unmap and
996 + * may_writepage. Check the remaining to prevent livelock if it's not
997 + * making progress.
998 + */
999 + return isolated || !remaining ? scanned : 0;
1000 +}
1001 +
1002 +static int get_tier_idx(struct lruvec *lruvec, int type)
1003 +{
1004 + int tier;
1005 + struct ctrl_pos sp, pv;
1006 +
1007 + /*
1008 + * To leave a margin for fluctuations, use a larger gain factor (1:2).
1009 + * This value is chosen because any other tier would have at least twice
1010 + * as many refaults as the first tier.
1011 + */
1012 + read_ctrl_pos(lruvec, type, 0, 1, &sp);
1013 + for (tier = 1; tier < MAX_NR_TIERS; tier++) {
1014 + read_ctrl_pos(lruvec, type, tier, 2, &pv);
1015 + if (!positive_ctrl_err(&sp, &pv))
1016 + break;
1017 + }
1018 +
1019 + return tier - 1;
1020 +}
1021 +
1022 +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
1023 +{
1024 + int type, tier;
1025 + struct ctrl_pos sp, pv;
1026 + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
1027 +
1028 + /*
1029 + * Compare the first tier of anon with that of file to determine which
1030 + * type to scan. Also need to compare other tiers of the selected type
1031 + * with the first tier of the other type to determine the last tier (of
1032 + * the selected type) to evict.
1033 + */
1034 + read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
1035 + read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
1036 + type = positive_ctrl_err(&sp, &pv);
1037 +
1038 + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
1039 + for (tier = 1; tier < MAX_NR_TIERS; tier++) {
1040 + read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
1041 + if (!positive_ctrl_err(&sp, &pv))
1042 + break;
1043 + }
1044 +
1045 + *tier_idx = tier - 1;
1046 +
1047 + return type;
1048 +}
1049 +
1050 +static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
1051 + int *type_scanned, struct list_head *list)
1052 +{
1053 + int i;
1054 + int type;
1055 + int scanned;
1056 + int tier = -1;
1057 + DEFINE_MIN_SEQ(lruvec);
1058 +
1059 + /*
1060 + * Try to make the obvious choice first. When anon and file are both
1061 + * available from the same generation, interpret swappiness 1 as file
1062 + * first and 200 as anon first.
1063 + */
1064 + if (!swappiness)
1065 + type = LRU_GEN_FILE;
1066 + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
1067 + type = LRU_GEN_ANON;
1068 + else if (swappiness == 1)
1069 + type = LRU_GEN_FILE;
1070 + else if (swappiness == 200)
1071 + type = LRU_GEN_ANON;
1072 + else
1073 + type = get_type_to_scan(lruvec, swappiness, &tier);
1074 +
1075 + for (i = !swappiness; i < ANON_AND_FILE; i++) {
1076 + if (tier < 0)
1077 + tier = get_tier_idx(lruvec, type);
1078 +
1079 + scanned = scan_pages(lruvec, sc, type, tier, list);
1080 + if (scanned)
1081 + break;
1082 +
1083 + type = !type;
1084 + tier = -1;
1085 + }
1086 +
1087 + *type_scanned = type;
1088 +
1089 + return scanned;
1090 +}
1091 +
1092 +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
1093 +{
1094 + int type;
1095 + int scanned;
1096 + int reclaimed;
1097 + LIST_HEAD(list);
1098 + struct page *page;
1099 + enum vm_event_item item;
1100 + struct reclaim_stat stat;
1101 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
1102 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1103 +
1104 + spin_lock_irq(&lruvec->lru_lock);
1105 +
1106 + scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
1107 +
1108 + scanned += try_to_inc_min_seq(lruvec, swappiness);
1109 +
1110 + if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
1111 + scanned = 0;
1112 +
1113 + spin_unlock_irq(&lruvec->lru_lock);
1114 +
1115 + if (list_empty(&list))
1116 + return scanned;
1117 +
1118 + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
1119 +
1120 + list_for_each_entry(page, &list, lru) {
1121 + /* restore LRU_REFS_FLAGS cleared by isolate_page() */
1122 + if (PageWorkingset(page))
1123 + SetPageReferenced(page);
1124 +
1125 + /* don't add rejected pages to the oldest generation */
1126 + if (PageReclaim(page) &&
1127 + (PageDirty(page) || PageWriteback(page)))
1128 + ClearPageActive(page);
1129 + else
1130 + SetPageActive(page);
1131 + }
1132 +
1133 + spin_lock_irq(&lruvec->lru_lock);
1134 +
1135 + move_pages_to_lru(lruvec, &list);
1136 +
1137 + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
1138 + if (!cgroup_reclaim(sc))
1139 + __count_vm_events(item, reclaimed);
1140 + __count_memcg_events(memcg, item, reclaimed);
1141 + __count_vm_events(PGSTEAL_ANON + type, reclaimed);
1142 +
1143 + spin_unlock_irq(&lruvec->lru_lock);
1144 +
1145 + mem_cgroup_uncharge_list(&list);
1146 + free_unref_page_list(&list);
1147 +
1148 + sc->nr_reclaimed += reclaimed;
1149 +
1150 + return scanned;
1151 +}
1152 +
1153 +static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
1154 + bool can_swap)
1155 +{
1156 + bool need_aging;
1157 + unsigned long nr_to_scan;
1158 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
1159 + DEFINE_MAX_SEQ(lruvec);
1160 + DEFINE_MIN_SEQ(lruvec);
1161 +
1162 + if (mem_cgroup_below_min(memcg) ||
1163 + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
1164 + return 0;
1165 +
1166 + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
1167 + if (!need_aging)
1168 + return nr_to_scan;
1169 +
1170 + /* skip the aging path at the default priority */
1171 + if (sc->priority == DEF_PRIORITY)
1172 + goto done;
1173 +
1174 + /* leave the work to lru_gen_age_node() */
1175 + if (current_is_kswapd())
1176 + return 0;
1177 +
1178 + inc_max_seq(lruvec, max_seq, can_swap);
1179 +done:
1180 + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
1181 +}
1182 +
1183 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1184 +{
1185 + struct blk_plug plug;
1186 + unsigned long scanned = 0;
1187 +
1188 + lru_add_drain();
1189 +
1190 + blk_start_plug(&plug);
1191 +
1192 + while (true) {
1193 + int delta;
1194 + int swappiness;
1195 + unsigned long nr_to_scan;
1196 +
1197 + if (sc->may_swap)
1198 + swappiness = get_swappiness(lruvec, sc);
1199 + else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
1200 + swappiness = 1;
1201 + else
1202 + swappiness = 0;
1203 +
1204 + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
1205 + if (!nr_to_scan)
1206 + break;
1207 +
1208 + delta = evict_pages(lruvec, sc, swappiness);
1209 + if (!delta)
1210 + break;
1211 +
1212 + scanned += delta;
1213 + if (scanned >= nr_to_scan)
1214 + break;
1215 +
1216 + cond_resched();
1217 + }
1218 +
1219 + blk_finish_plug(&plug);
1220 +}
1221 +
1222 /******************************************************************************
1223 * initialization
1224 ******************************************************************************/
1225 @@ -2894,6 +3649,16 @@ static int __init init_lru_gen(void)
1226 };
1227 late_initcall(init_lru_gen);
1228
1229 +#else /* !CONFIG_LRU_GEN */
1230 +
1231 +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
1232 +{
1233 +}
1234 +
1235 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1236 +{
1237 +}
1238 +
1239 #endif /* CONFIG_LRU_GEN */
1240
1241 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1242 @@ -2907,6 +3672,11 @@ static void shrink_lruvec(struct lruvec
1243 bool proportional_reclaim;
1244 struct blk_plug plug;
1245
1246 + if (lru_gen_enabled()) {
1247 + lru_gen_shrink_lruvec(lruvec, sc);
1248 + return;
1249 + }
1250 +
1251 get_scan_count(lruvec, sc, nr);
1252
1253 /* Record the original scan target for proportional adjustments later */
1254 @@ -3375,6 +4145,9 @@ static void snapshot_refaults(struct mem
1255 struct lruvec *target_lruvec;
1256 unsigned long refaults;
1257
1258 + if (lru_gen_enabled())
1259 + return;
1260 +
1261 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
1262 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
1263 target_lruvec->refaults[0] = refaults;
1264 @@ -3739,12 +4512,16 @@ unsigned long try_to_free_mem_cgroup_pag
1265 }
1266 #endif
1267
1268 -static void age_active_anon(struct pglist_data *pgdat,
1269 - struct scan_control *sc)
1270 +static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
1271 {
1272 struct mem_cgroup *memcg;
1273 struct lruvec *lruvec;
1274
1275 + if (lru_gen_enabled()) {
1276 + lru_gen_age_node(pgdat, sc);
1277 + return;
1278 + }
1279 +
1280 if (!can_age_anon_pages(pgdat, sc))
1281 return;
1282
1283 @@ -4061,12 +4838,11 @@ restart:
1284 sc.may_swap = !nr_boost_reclaim;
1285
1286 /*
1287 - * Do some background aging of the anon list, to give
1288 - * pages a chance to be referenced before reclaiming. All
1289 - * pages are rotated regardless of classzone as this is
1290 - * about consistent aging.
1291 + * Do some background aging, to give pages a chance to be
1292 + * referenced before reclaiming. All pages are rotated
1293 + * regardless of classzone as this is about consistent aging.
1294 */
1295 - age_active_anon(pgdat, &sc);
1296 + kswapd_age_node(pgdat, &sc);
1297
1298 /*
1299 * If we're getting trouble reclaiming, start doing writepage
1300 --- a/mm/workingset.c
1301 +++ b/mm/workingset.c
1302 @@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
1303 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
1304 bool workingset)
1305 {
1306 - eviction >>= bucket_order;
1307 eviction &= EVICTION_MASK;
1308 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
1309 eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
1310 @@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow,
1311
1312 *memcgidp = memcgid;
1313 *pgdat = NODE_DATA(nid);
1314 - *evictionp = entry << bucket_order;
1315 + *evictionp = entry;
1316 *workingsetp = workingset;
1317 }
1318
1319 +#ifdef CONFIG_LRU_GEN
1320 +
1321 +static void *lru_gen_eviction(struct page *page)
1322 +{
1323 + int hist;
1324 + unsigned long token;
1325 + unsigned long min_seq;
1326 + struct lruvec *lruvec;
1327 + struct lru_gen_struct *lrugen;
1328 + int type = page_is_file_lru(page);
1329 + int delta = thp_nr_pages(page);
1330 + int refs = page_lru_refs(page);
1331 + int tier = lru_tier_from_refs(refs);
1332 + struct mem_cgroup *memcg = page_memcg(page);
1333 + struct pglist_data *pgdat = page_pgdat(page);
1334 +
1335 + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
1336 +
1337 + lruvec = mem_cgroup_lruvec(memcg, pgdat);
1338 + lrugen = &lruvec->lrugen;
1339 + min_seq = READ_ONCE(lrugen->min_seq[type]);
1340 + token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
1341 +
1342 + hist = lru_hist_from_seq(min_seq);
1343 + atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
1344 +
1345 + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
1346 +}
1347 +
1348 +static void lru_gen_refault(struct page *page, void *shadow)
1349 +{
1350 + int hist, tier, refs;
1351 + int memcg_id;
1352 + bool workingset;
1353 + unsigned long token;
1354 + unsigned long min_seq;
1355 + struct lruvec *lruvec;
1356 + struct lru_gen_struct *lrugen;
1357 + struct mem_cgroup *memcg;
1358 + struct pglist_data *pgdat;
1359 + int type = page_is_file_lru(page);
1360 + int delta = thp_nr_pages(page);
1361 +
1362 + unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
1363 +
1364 + if (pgdat != page_pgdat(page))
1365 + return;
1366 +
1367 + rcu_read_lock();
1368 +
1369 + memcg = page_memcg_rcu(page);
1370 + if (memcg_id != mem_cgroup_id(memcg))
1371 + goto unlock;
1372 +
1373 + lruvec = mem_cgroup_lruvec(memcg, pgdat);
1374 + lrugen = &lruvec->lrugen;
1375 +
1376 + min_seq = READ_ONCE(lrugen->min_seq[type]);
1377 + if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
1378 + goto unlock;
1379 +
1380 + hist = lru_hist_from_seq(min_seq);
1381 + /* see the comment in page_lru_refs() */
1382 + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
1383 + tier = lru_tier_from_refs(refs);
1384 +
1385 + atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
1386 + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
1387 +
1388 + /*
1389 + * Count the following two cases as stalls:
1390 + * 1. For pages accessed through page tables, hotter pages pushed out
1391 + * hot pages which refaulted immediately.
1392 + * 2. For pages accessed multiple times through file descriptors,
1393 + * numbers of accesses might have been out of the range.
1394 + */
1395 + if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
1396 + SetPageWorkingset(page);
1397 + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
1398 + }
1399 +unlock:
1400 + rcu_read_unlock();
1401 +}
1402 +
1403 +#else /* !CONFIG_LRU_GEN */
1404 +
1405 +static void *lru_gen_eviction(struct page *page)
1406 +{
1407 + return NULL;
1408 +}
1409 +
1410 +static void lru_gen_refault(struct page *page, void *shadow)
1411 +{
1412 +}
1413 +
1414 +#endif /* CONFIG_LRU_GEN */
1415 +
1416 /**
1417 * workingset_age_nonresident - age non-resident entries as LRU ages
1418 * @lruvec: the lruvec that was aged
1419 @@ -264,10 +360,14 @@ void *workingset_eviction(struct page *p
1420 VM_BUG_ON_PAGE(page_count(page), page);
1421 VM_BUG_ON_PAGE(!PageLocked(page), page);
1422
1423 + if (lru_gen_enabled())
1424 + return lru_gen_eviction(page);
1425 +
1426 lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
1427 /* XXX: target_memcg can be NULL, go through lruvec */
1428 memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
1429 eviction = atomic_long_read(&lruvec->nonresident_age);
1430 + eviction >>= bucket_order;
1431 workingset_age_nonresident(lruvec, thp_nr_pages(page));
1432 return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
1433 }
1434 @@ -296,7 +396,13 @@ void workingset_refault(struct page *pag
1435 bool workingset;
1436 int memcgid;
1437
1438 + if (lru_gen_enabled()) {
1439 + lru_gen_refault(page, shadow);
1440 + return;
1441 + }
1442 +
1443 unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
1444 + eviction <<= bucket_order;
1445
1446 rcu_read_lock();
1447 /*