target/linux/generic/backport-5.15/020-v6.1-08-mm-multi-gen-LRU-support-page-table-walks.patch

   1 From 05223c4e80b34e29f2255c04ffebc2c4475e7593 Mon Sep 17 00:00:00 2001
   2 From: Yu Zhao <yuzhao@google.com>
   3 Date: Sun, 18 Sep 2022 02:00:05 -0600
   4 Subject: [PATCH 08/29] mm: multi-gen LRU: support page table walks
   5 MIME-Version: 1.0
   6 Content-Type: text/plain; charset=UTF-8
   7 Content-Transfer-Encoding: 8bit
   8
   9 To further exploit spatial locality, the aging prefers to walk page tables
  10 to search for young PTEs and promote hot pages.  A kill switch will be
  11 added in the next patch to disable this behavior.  When disabled, the
  12 aging relies on the rmap only.
  13
  14 NB: this behavior has nothing in common with the page table scanning in the
  15 2.4 kernel [1], which searches page tables for old PTEs, adds cold pages
  16 to swapcache and unmaps them.
  17
  18 To avoid confusion, the term "iteration" specifically means the traversal
  19 of an entire mm_struct list; the term "walk" will be applied to page
  20 tables and the rmap, as usual.
  21
  22 An mm_struct list is maintained for each memcg, and an mm_struct follows
  23 its owner task to the new memcg when this task is migrated.  Given an
  24 lruvec, the aging iterates lruvec_memcg()->mm_list and calls
  25 walk_page_range() with each mm_struct on this list to promote hot pages
  26 before it increments max_seq.
  27
  28 When multiple page table walkers iterate the same list, each of them gets
  29 a unique mm_struct; therefore they can run concurrently.  Page table
  30 walkers ignore any misplaced pages, e.g., if an mm_struct was migrated,
  31 pages it left in the previous memcg will not be promoted when its current
  32 memcg is under reclaim.  Similarly, page table walkers will not promote
  33 pages from nodes other than the one under reclaim.
  34
  35 This patch uses the following optimizations when walking page tables:
  36 1. It tracks the usage of mm_struct's between context switches so that
  37    page table walkers can skip processes that have been sleeping since
  38    the last iteration.
  39 2. It uses generational Bloom filters to record populated branches so
  40    that page table walkers can reduce their search space based on the
  41    query results, e.g., to skip page tables containing mostly holes or
  42    misplaced pages.
  43 3. It takes advantage of the accessed bit in non-leaf PMD entries when
  44    CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
  45 4. It does not zigzag between a PGD table and the same PMD table
  46    spanning multiple VMAs. IOW, it finishes all the VMAs within the
  47    range of the same PMD table before it returns to a PGD table. This
  48    improves the cache performance for workloads that have large
  49    numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.
  50
  51 Server benchmark results:
  52   Single workload:
  53     fio (buffered I/O): no change
  54
  55   Single workload:
  56     memcached (anon): +[8, 10]%
  57                 Ops/sec      KB/sec
  58       patch1-7: 1147696.57   44640.29
  59       patch1-8: 1245274.91   48435.66
  60
  61   Configurations:
  62     no change
  63
  64 Client benchmark results:
  65   kswapd profiles:
  66     patch1-7
  67       48.16%  lzo1x_1_do_compress (real work)
  68        8.20%  page_vma_mapped_walk (overhead)
  69        7.06%  _raw_spin_unlock_irq
  70        2.92%  ptep_clear_flush
  71        2.53%  __zram_bvec_write
  72        2.11%  do_raw_spin_lock
  73        2.02%  memmove
  74        1.93%  lru_gen_look_around
  75        1.56%  free_unref_page_list
  76        1.40%  memset
  77
  78     patch1-8
  79       49.44%  lzo1x_1_do_compress (real work)
  80        6.19%  page_vma_mapped_walk (overhead)
  81        5.97%  _raw_spin_unlock_irq
  82        3.13%  get_pfn_page
  83        2.85%  ptep_clear_flush
  84        2.42%  __zram_bvec_write
  85        2.08%  do_raw_spin_lock
  86        1.92%  memmove
  87        1.44%  alloc_zspage
  88        1.36%  memset
  89
  90   Configurations:
  91     no change
  92
  93 Thanks to the following developers for their efforts [3].
  94   kernel test robot <lkp@intel.com>
  95
  96 [1] https://lwn.net/Articles/23732/
  97 [2] https://llvm.org/docs/ScudoHardenedAllocator.html
  98 [3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/
  99
 100 Link: https://lkml.kernel.org/r/20220918080010.2920238-9-yuzhao@google.com
 101 Signed-off-by: Yu Zhao <yuzhao@google.com>
 102 Acked-by: Brian Geffon <bgeffon@google.com>
 103 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 104 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 105 Acked-by: Steven Barrett <steven@liquorix.net>
 106 Acked-by: Suleiman Souhlal <suleiman@google.com>
 107 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 108 Tested-by: Donald Carr <d@chaos-reins.com>
 109 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 110 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 111 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 112 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 113 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 114 Cc: Andi Kleen <ak@linux.intel.com>
 115 Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
 116 Cc: Barry Song <baohua@kernel.org>
 117 Cc: Catalin Marinas <catalin.marinas@arm.com>
 118 Cc: Dave Hansen <dave.hansen@linux.intel.com>
 119 Cc: Hillf Danton <hdanton@sina.com>
 120 Cc: Jens Axboe <axboe@kernel.dk>
 121 Cc: Johannes Weiner <hannes@cmpxchg.org>
 122 Cc: Jonathan Corbet <corbet@lwn.net>
 123 Cc: Linus Torvalds <torvalds@linux-foundation.org>
 124 Cc: Matthew Wilcox <willy@infradead.org>
 125 Cc: Mel Gorman <mgorman@suse.de>
 126 Cc: Miaohe Lin <linmiaohe@huawei.com>
 127 Cc: Michael Larabel <Michael@MichaelLarabel.com>
 128 Cc: Michal Hocko <mhocko@kernel.org>
 129 Cc: Mike Rapoport <rppt@kernel.org>
 130 Cc: Mike Rapoport <rppt@linux.ibm.com>
 131 Cc: Peter Zijlstra <peterz@infradead.org>
 132 Cc: Qi Zheng <zhengqi.arch@bytedance.com>
 133 Cc: Tejun Heo <tj@kernel.org>
 134 Cc: Vlastimil Babka <vbabka@suse.cz>
 135 Cc: Will Deacon <will@kernel.org>
 136 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 137 ---
 138  fs/exec.c                  |    2 +
 139  include/linux/memcontrol.h |    5 +
 140  include/linux/mm_types.h   |   76 +++
 141  include/linux/mmzone.h     |   56 +-
 142  include/linux/swap.h       |    4 +
 143  kernel/exit.c              |    1 +
 144  kernel/fork.c              |    9 +
 145  kernel/sched/core.c        |    1 +
 146  mm/memcontrol.c            |   25 +
 147  mm/vmscan.c                | 1010 +++++++++++++++++++++++++++++++++++-
 148  10 files changed, 1172 insertions(+), 17 deletions(-)
 149
 150 --- a/fs/exec.c
 151 +++ b/fs/exec.c
 152 @@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
 153         active_mm = tsk->active_mm;
 154         tsk->active_mm = mm;
 155         tsk->mm = mm;
 156 +       lru_gen_add_mm(mm);
 157         /*
 158          * This prevents preemption while active_mm is being loaded and
 159          * it and mm are being updated, which could cause problems for
 160 @@ -1028,6 +1029,7 @@ static int exec_mmap(struct mm_struct *m
 161         tsk->mm->vmacache_seqnum = 0;
 162         vmacache_flush(tsk);
 163         task_unlock(tsk);
 164 +       lru_gen_use_mm(mm);
 165         if (old_mm) {
 166                 mmap_read_unlock(old_mm);
 167                 BUG_ON(active_mm != old_mm);
 168 --- a/include/linux/memcontrol.h
 169 +++ b/include/linux/memcontrol.h
 170 @@ -353,6 +353,11 @@ struct mem_cgroup {
 171         struct deferred_split deferred_split_queue;
 172  #endif
 173
 174 +#ifdef CONFIG_LRU_GEN
 175 +       /* per-memcg mm_struct list */
 176 +       struct lru_gen_mm_list mm_list;
 177 +#endif
 178 +
 179         struct mem_cgroup_per_node *nodeinfo[];
 180  };
 181
 182 --- a/include/linux/mm_types.h
 183 +++ b/include/linux/mm_types.h
 184 @@ -580,6 +580,22 @@ struct mm_struct {
 185  #ifdef CONFIG_IOMMU_SUPPORT
 186                 u32 pasid;
 187  #endif
 188 +#ifdef CONFIG_LRU_GEN
 189 +               struct {
 190 +                       /* this mm_struct is on lru_gen_mm_list */
 191 +                       struct list_head list;
 192 +                       /*
 193 +                        * Set when switching to this mm_struct, as a hint of
 194 +                        * whether it has been used since the last time per-node
 195 +                        * page table walkers cleared the corresponding bits.
 196 +                        */
 197 +                       unsigned long bitmap;
 198 +#ifdef CONFIG_MEMCG
 199 +                       /* points to the memcg of "owner" above */
 200 +                       struct mem_cgroup *memcg;
 201 +#endif
 202 +               } lru_gen;
 203 +#endif /* CONFIG_LRU_GEN */
 204         } __randomize_layout;
 205
 206         /*
 207 @@ -606,6 +622,66 @@ static inline cpumask_t *mm_cpumask(stru
 208         return (struct cpumask *)&mm->cpu_bitmap;
 209  }
 210
 211 +#ifdef CONFIG_LRU_GEN
 212 +
 213 +struct lru_gen_mm_list {
 214 +       /* mm_struct list for page table walkers */
 215 +       struct list_head fifo;
 216 +       /* protects the list above */
 217 +       spinlock_t lock;
 218 +};
 219 +
 220 +void lru_gen_add_mm(struct mm_struct *mm);
 221 +void lru_gen_del_mm(struct mm_struct *mm);
 222 +#ifdef CONFIG_MEMCG
 223 +void lru_gen_migrate_mm(struct mm_struct *mm);
 224 +#endif
 225 +
 226 +static inline void lru_gen_init_mm(struct mm_struct *mm)
 227 +{
 228 +       INIT_LIST_HEAD(&mm->lru_gen.list);
 229 +       mm->lru_gen.bitmap = 0;
 230 +#ifdef CONFIG_MEMCG
 231 +       mm->lru_gen.memcg = NULL;
 232 +#endif
 233 +}
 234 +
 235 +static inline void lru_gen_use_mm(struct mm_struct *mm)
 236 +{
 237 +       /*
 238 +        * When the bitmap is set, page reclaim knows this mm_struct has been
 239 +        * used since the last time it cleared the bitmap. So it might be worth
 240 +        * walking the page tables of this mm_struct to clear the accessed bit.
 241 +        */
 242 +       WRITE_ONCE(mm->lru_gen.bitmap, -1);
 243 +}
 244 +
 245 +#else /* !CONFIG_LRU_GEN */
 246 +
 247 +static inline void lru_gen_add_mm(struct mm_struct *mm)
 248 +{
 249 +}
 250 +
 251 +static inline void lru_gen_del_mm(struct mm_struct *mm)
 252 +{
 253 +}
 254 +
 255 +#ifdef CONFIG_MEMCG
 256 +static inline void lru_gen_migrate_mm(struct mm_struct *mm)
 257 +{
 258 +}
 259 +#endif
 260 +
 261 +static inline void lru_gen_init_mm(struct mm_struct *mm)
 262 +{
 263 +}
 264 +
 265 +static inline void lru_gen_use_mm(struct mm_struct *mm)
 266 +{
 267 +}
 268 +
 269 +#endif /* CONFIG_LRU_GEN */
 270 +
 271  struct mmu_gather;
 272  extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 273  extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
 274 --- a/include/linux/mmzone.h
 275 +++ b/include/linux/mmzone.h
 276 @@ -385,7 +385,7 @@ enum {
 277   * min_seq behind.
 278   *
 279   * The number of pages in each generation is eventually consistent and therefore
 280 - * can be transiently negative.
 281 + * can be transiently negative when reset_batch_size() is pending.
 282   */
 283  struct lru_gen_struct {
 284         /* the aging increments the youngest generation number */
 285 @@ -407,6 +407,53 @@ struct lru_gen_struct {
 286         atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 287  };
 288
 289 +enum {
 290 +       MM_LEAF_TOTAL,          /* total leaf entries */
 291 +       MM_LEAF_OLD,            /* old leaf entries */
 292 +       MM_LEAF_YOUNG,          /* young leaf entries */
 293 +       MM_NONLEAF_TOTAL,       /* total non-leaf entries */
 294 +       MM_NONLEAF_FOUND,       /* non-leaf entries found in Bloom filters */
 295 +       MM_NONLEAF_ADDED,       /* non-leaf entries added to Bloom filters */
 296 +       NR_MM_STATS
 297 +};
 298 +
 299 +/* double-buffering Bloom filters */
 300 +#define NR_BLOOM_FILTERS       2
 301 +
 302 +struct lru_gen_mm_state {
 303 +       /* set to max_seq after each iteration */
 304 +       unsigned long seq;
 305 +       /* where the current iteration continues (inclusive) */
 306 +       struct list_head *head;
 307 +       /* where the last iteration ended (exclusive) */
 308 +       struct list_head *tail;
 309 +       /* to wait for the last page table walker to finish */
 310 +       struct wait_queue_head wait;
 311 +       /* Bloom filters flip after each iteration */
 312 +       unsigned long *filters[NR_BLOOM_FILTERS];
 313 +       /* the mm stats for debugging */
 314 +       unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
 315 +       /* the number of concurrent page table walkers */
 316 +       int nr_walkers;
 317 +};
 318 +
 319 +struct lru_gen_mm_walk {
 320 +       /* the lruvec under reclaim */
 321 +       struct lruvec *lruvec;
 322 +       /* unstable max_seq from lru_gen_struct */
 323 +       unsigned long max_seq;
 324 +       /* the next address within an mm to scan */
 325 +       unsigned long next_addr;
 326 +       /* to batch promoted pages */
 327 +       int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
 328 +       /* to batch the mm stats */
 329 +       int mm_stats[NR_MM_STATS];
 330 +       /* total batched items */
 331 +       int batched;
 332 +       bool can_swap;
 333 +       bool force_scan;
 334 +};
 335 +
 336  void lru_gen_init_lruvec(struct lruvec *lruvec);
 337  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 338
 339 @@ -457,6 +504,8 @@ struct lruvec {
 340  #ifdef CONFIG_LRU_GEN
 341         /* evictable pages divided into generations */
 342         struct lru_gen_struct           lrugen;
 343 +       /* to concurrently iterate lru_gen_mm_list */
 344 +       struct lru_gen_mm_state         mm_state;
 345  #endif
 346  #ifdef CONFIG_MEMCG
 347         struct pglist_data *pgdat;
 348 @@ -1042,6 +1091,11 @@ typedef struct pglist_data {
 349
 350         unsigned long           flags;
 351
 352 +#ifdef CONFIG_LRU_GEN
 353 +       /* kswapd mm walk data */
 354 +       struct lru_gen_mm_walk  mm_walk;
 355 +#endif
 356 +
 357         ZONE_PADDING(_pad2_)
 358
 359         /* Per-node vmstats */
 360 --- a/include/linux/swap.h
 361 +++ b/include/linux/swap.h
 362 @@ -137,6 +137,10 @@ union swap_header {
 363   */
 364  struct reclaim_state {
 365         unsigned long reclaimed_slab;
 366 +#ifdef CONFIG_LRU_GEN
 367 +       /* per-thread mm walk data */
 368 +       struct lru_gen_mm_walk *mm_walk;
 369 +#endif
 370  };
 371
 372  #ifdef __KERNEL__
 373 --- a/kernel/exit.c
 374 +++ b/kernel/exit.c
 375 @@ -469,6 +469,7 @@ assign_new_owner:
 376                 goto retry;
 377         }
 378         WRITE_ONCE(mm->owner, c);
 379 +       lru_gen_migrate_mm(mm);
 380         task_unlock(c);
 381         put_task_struct(c);
 382  }
 383 --- a/kernel/fork.c
 384 +++ b/kernel/fork.c
 385 @@ -1091,6 +1091,7 @@ static struct mm_struct *mm_init(struct
 386                 goto fail_nocontext;
 387
 388         mm->user_ns = get_user_ns(user_ns);
 389 +       lru_gen_init_mm(mm);
 390         return mm;
 391
 392  fail_nocontext:
 393 @@ -1133,6 +1134,7 @@ static inline void __mmput(struct mm_str
 394         }
 395         if (mm->binfmt)
 396                 module_put(mm->binfmt->module);
 397 +       lru_gen_del_mm(mm);
 398         mmdrop(mm);
 399  }
 400
 401 @@ -2625,6 +2627,13 @@ pid_t kernel_clone(struct kernel_clone_a
 402                 get_task_struct(p);
 403         }
 404
 405 +       if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
 406 +               /* lock the task to synchronize with memcg migration */
 407 +               task_lock(p);
 408 +               lru_gen_add_mm(p->mm);
 409 +               task_unlock(p);
 410 +       }
 411 +
 412         wake_up_new_task(p);
 413
 414         /* forking complete and child started to run, tell ptracer */
 415 --- a/kernel/sched/core.c
 416 +++ b/kernel/sched/core.c
 417 @@ -5010,6 +5010,7 @@ context_switch(struct rq *rq, struct tas
 418                  * finish_task_switch()'s mmdrop().
 419                  */
 420                 switch_mm_irqs_off(prev->active_mm, next->mm, next);
 421 +               lru_gen_use_mm(next->mm);
 422
 423                 if (!prev->mm) {                        // from kernel
 424                         /* will mmdrop() in finish_task_switch(). */
 425 --- a/mm/memcontrol.c
 426 +++ b/mm/memcontrol.c
 427 @@ -6212,6 +6212,30 @@ static void mem_cgroup_move_task(void)
 428  }
 429  #endif
 430
 431 +#ifdef CONFIG_LRU_GEN
 432 +static void mem_cgroup_attach(struct cgroup_taskset *tset)
 433 +{
 434 +       struct task_struct *task;
 435 +       struct cgroup_subsys_state *css;
 436 +
 437 +       /* find the first leader if there is any */
 438 +       cgroup_taskset_for_each_leader(task, css, tset)
 439 +               break;
 440 +
 441 +       if (!task)
 442 +               return;
 443 +
 444 +       task_lock(task);
 445 +       if (task->mm && READ_ONCE(task->mm->owner) == task)
 446 +               lru_gen_migrate_mm(task->mm);
 447 +       task_unlock(task);
 448 +}
 449 +#else
 450 +static void mem_cgroup_attach(struct cgroup_taskset *tset)
 451 +{
 452 +}
 453 +#endif /* CONFIG_LRU_GEN */
 454 +
 455  static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
 456  {
 457         if (value == PAGE_COUNTER_MAX)
 458 @@ -6555,6 +6579,7 @@ struct cgroup_subsys memory_cgrp_subsys
 459         .css_reset = mem_cgroup_css_reset,
 460         .css_rstat_flush = mem_cgroup_css_rstat_flush,
 461         .can_attach = mem_cgroup_can_attach,
 462 +       .attach = mem_cgroup_attach,
 463         .cancel_attach = mem_cgroup_cancel_attach,
 464         .post_attach = mem_cgroup_move_task,
 465         .dfl_cftypes = memory_files,
 466 --- a/mm/vmscan.c
 467 +++ b/mm/vmscan.c
 468 @@ -50,6 +50,8 @@
 469  #include <linux/printk.h>
 470  #include <linux/dax.h>
 471  #include <linux/psi.h>
 472 +#include <linux/pagewalk.h>
 473 +#include <linux/shmem_fs.h>
 474
 475  #include <asm/tlbflush.h>
 476  #include <asm/div64.h>
 477 @@ -2853,7 +2855,7 @@ static bool can_age_anon_pages(struct pg
 478                 for ((type) = 0; (type) < ANON_AND_FILE; (type)++)      \
 479                         for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
 480
 481 -static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
 482 +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
 483  {
 484         struct pglist_data *pgdat = NODE_DATA(nid);
 485
 486 @@ -2899,6 +2901,371 @@ static bool __maybe_unused seq_is_valid(
 487  }
 488
 489  /******************************************************************************
 490 + *                          mm_struct list
 491 + ******************************************************************************/
 492 +
 493 +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
 494 +{
 495 +       static struct lru_gen_mm_list mm_list = {
 496 +               .fifo = LIST_HEAD_INIT(mm_list.fifo),
 497 +               .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
 498 +       };
 499 +
 500 +#ifdef CONFIG_MEMCG
 501 +       if (memcg)
 502 +               return &memcg->mm_list;
 503 +#endif
 504 +       VM_WARN_ON_ONCE(!mem_cgroup_disabled());
 505 +
 506 +       return &mm_list;
 507 +}
 508 +
 509 +void lru_gen_add_mm(struct mm_struct *mm)
 510 +{
 511 +       int nid;
 512 +       struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
 513 +       struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
 514 +
 515 +       VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
 516 +#ifdef CONFIG_MEMCG
 517 +       VM_WARN_ON_ONCE(mm->lru_gen.memcg);
 518 +       mm->lru_gen.memcg = memcg;
 519 +#endif
 520 +       spin_lock(&mm_list->lock);
 521 +
 522 +       for_each_node_state(nid, N_MEMORY) {
 523 +               struct lruvec *lruvec = get_lruvec(memcg, nid);
 524 +
 525 +               if (!lruvec)
 526 +                       continue;
 527 +
 528 +               /* the first addition since the last iteration */
 529 +               if (lruvec->mm_state.tail == &mm_list->fifo)
 530 +                       lruvec->mm_state.tail = &mm->lru_gen.list;
 531 +       }
 532 +
 533 +       list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
 534 +
 535 +       spin_unlock(&mm_list->lock);
 536 +}
 537 +
 538 +void lru_gen_del_mm(struct mm_struct *mm)
 539 +{
 540 +       int nid;
 541 +       struct lru_gen_mm_list *mm_list;
 542 +       struct mem_cgroup *memcg = NULL;
 543 +
 544 +       if (list_empty(&mm->lru_gen.list))
 545 +               return;
 546 +
 547 +#ifdef CONFIG_MEMCG
 548 +       memcg = mm->lru_gen.memcg;
 549 +#endif
 550 +       mm_list = get_mm_list(memcg);
 551 +
 552 +       spin_lock(&mm_list->lock);
 553 +
 554 +       for_each_node(nid) {
 555 +               struct lruvec *lruvec = get_lruvec(memcg, nid);
 556 +
 557 +               if (!lruvec)
 558 +                       continue;
 559 +
 560 +               /* where the last iteration ended (exclusive) */
 561 +               if (lruvec->mm_state.tail == &mm->lru_gen.list)
 562 +                       lruvec->mm_state.tail = lruvec->mm_state.tail->next;
 563 +
 564 +               /* where the current iteration continues (inclusive) */
 565 +               if (lruvec->mm_state.head != &mm->lru_gen.list)
 566 +                       continue;
 567 +
 568 +               lruvec->mm_state.head = lruvec->mm_state.head->next;
 569 +               /* the deletion ends the current iteration */
 570 +               if (lruvec->mm_state.head == &mm_list->fifo)
 571 +                       WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
 572 +       }
 573 +
 574 +       list_del_init(&mm->lru_gen.list);
 575 +
 576 +       spin_unlock(&mm_list->lock);
 577 +
 578 +#ifdef CONFIG_MEMCG
 579 +       mem_cgroup_put(mm->lru_gen.memcg);
 580 +       mm->lru_gen.memcg = NULL;
 581 +#endif
 582 +}
 583 +
 584 +#ifdef CONFIG_MEMCG
 585 +void lru_gen_migrate_mm(struct mm_struct *mm)
 586 +{
 587 +       struct mem_cgroup *memcg;
 588 +       struct task_struct *task = rcu_dereference_protected(mm->owner, true);
 589 +
 590 +       VM_WARN_ON_ONCE(task->mm != mm);
 591 +       lockdep_assert_held(&task->alloc_lock);
 592 +
 593 +       /* for mm_update_next_owner() */
 594 +       if (mem_cgroup_disabled())
 595 +               return;
 596 +
 597 +       rcu_read_lock();
 598 +       memcg = mem_cgroup_from_task(task);
 599 +       rcu_read_unlock();
 600 +       if (memcg == mm->lru_gen.memcg)
 601 +               return;
 602 +
 603 +       VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
 604 +       VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
 605 +
 606 +       lru_gen_del_mm(mm);
 607 +       lru_gen_add_mm(mm);
 608 +}
 609 +#endif
 610 +
 611 +/*
 612 + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
 613 + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
 614 + * bits in a bitmap, k is the number of hash functions and n is the number of
 615 + * inserted items.
 616 + *
 617 + * Page table walkers use one of the two filters to reduce their search space.
 618 + * To get rid of non-leaf entries that no longer have enough leaf entries, the
 619 + * aging uses the double-buffering technique to flip to the other filter each
 620 + * time it produces a new generation. For non-leaf entries that have enough
 621 + * leaf entries, the aging carries them over to the next generation in
 622 + * walk_pmd_range(); the eviction also reports them when walking the rmap
 623 + * in lru_gen_look_around().
 624 + *
 625 + * For future optimizations:
 626 + * 1. It's not necessary to keep both filters all the time. The spare one can be
 627 + *    freed after the RCU grace period and reallocated if needed again.
 628 + * 2. And when reallocating, it's worth scaling its size according to the number
 629 + *    of inserted entries in the other filter, to reduce the memory overhead on
 630 + *    small systems and false positives on large systems.
 631 + * 3. Jenkins' hash function is an alternative to Knuth's.
 632 + */
 633 +#define BLOOM_FILTER_SHIFT     15
 634 +
 635 +static inline int filter_gen_from_seq(unsigned long seq)
 636 +{
 637 +       return seq % NR_BLOOM_FILTERS;
 638 +}
 639 +
 640 +static void get_item_key(void *item, int *key)
 641 +{
 642 +       u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
 643 +
 644 +       BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
 645 +
 646 +       key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
 647 +       key[1] = hash >> BLOOM_FILTER_SHIFT;
 648 +}
 649 +
 650 +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
 651 +{
 652 +       unsigned long *filter;
 653 +       int gen = filter_gen_from_seq(seq);
 654 +
 655 +       filter = lruvec->mm_state.filters[gen];
 656 +       if (filter) {
 657 +               bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
 658 +               return;
 659 +       }
 660 +
 661 +       filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
 662 +                              __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
 663 +       WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
 664 +}
 665 +
 666 +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
 667 +{
 668 +       int key[2];
 669 +       unsigned long *filter;
 670 +       int gen = filter_gen_from_seq(seq);
 671 +
 672 +       filter = READ_ONCE(lruvec->mm_state.filters[gen]);
 673 +       if (!filter)
 674 +               return;
 675 +
 676 +       get_item_key(item, key);
 677 +
 678 +       if (!test_bit(key[0], filter))
 679 +               set_bit(key[0], filter);
 680 +       if (!test_bit(key[1], filter))
 681 +               set_bit(key[1], filter);
 682 +}
 683 +
 684 +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
 685 +{
 686 +       int key[2];
 687 +       unsigned long *filter;
 688 +       int gen = filter_gen_from_seq(seq);
 689 +
 690 +       filter = READ_ONCE(lruvec->mm_state.filters[gen]);
 691 +       if (!filter)
 692 +               return true;
 693 +
 694 +       get_item_key(item, key);
 695 +
 696 +       return test_bit(key[0], filter) && test_bit(key[1], filter);
 697 +}
 698 +
 699 +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
 700 +{
 701 +       int i;
 702 +       int hist;
 703 +
 704 +       lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
 705 +
 706 +       if (walk) {
 707 +               hist = lru_hist_from_seq(walk->max_seq);
 708 +
 709 +               for (i = 0; i < NR_MM_STATS; i++) {
 710 +                       WRITE_ONCE(lruvec->mm_state.stats[hist][i],
 711 +                                  lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
 712 +                       walk->mm_stats[i] = 0;
 713 +               }
 714 +       }
 715 +
 716 +       if (NR_HIST_GENS > 1 && last) {
 717 +               hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
 718 +
 719 +               for (i = 0; i < NR_MM_STATS; i++)
 720 +                       WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
 721 +       }
 722 +}
 723 +
 724 +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
 725 +{
 726 +       int type;
 727 +       unsigned long size = 0;
 728 +       struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
 729 +       int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
 730 +
 731 +       if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
 732 +               return true;
 733 +
 734 +       clear_bit(key, &mm->lru_gen.bitmap);
 735 +
 736 +       for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
 737 +               size += type ? get_mm_counter(mm, MM_FILEPAGES) :
 738 +                              get_mm_counter(mm, MM_ANONPAGES) +
 739 +                              get_mm_counter(mm, MM_SHMEMPAGES);
 740 +       }
 741 +
 742 +       if (size < MIN_LRU_BATCH)
 743 +               return true;
 744 +
 745 +       return !mmget_not_zero(mm);
 746 +}
 747 +
 748 +static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 749 +                           struct mm_struct **iter)
 750 +{
 751 +       bool first = false;
 752 +       bool last = true;
 753 +       struct mm_struct *mm = NULL;
 754 +       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 755 +       struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
 756 +       struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
 757 +
 758 +       /*
 759 +        * There are four interesting cases for this page table walker:
 760 +        * 1. It tries to start a new iteration of mm_list with a stale max_seq;
 761 +        *    there is nothing left to do.
 762 +        * 2. It's the first of the current generation, and it needs to reset
 763 +        *    the Bloom filter for the next generation.
 764 +        * 3. It reaches the end of mm_list, and it needs to increment
 765 +        *    mm_state->seq; the iteration is done.
 766 +        * 4. It's the last of the current generation, and it needs to reset the
 767 +        *    mm stats counters for the next generation.
 768 +        */
 769 +       spin_lock(&mm_list->lock);
 770 +
 771 +       VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
 772 +       VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
 773 +       VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
 774 +
 775 +       if (walk->max_seq <= mm_state->seq) {
 776 +               if (!*iter)
 777 +                       last = false;
 778 +               goto done;
 779 +       }
 780 +
 781 +       if (!mm_state->nr_walkers) {
 782 +               VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
 783 +
 784 +               mm_state->head = mm_list->fifo.next;
 785 +               first = true;
 786 +       }
 787 +
 788 +       while (!mm && mm_state->head != &mm_list->fifo) {
 789 +               mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
 790 +
 791 +               mm_state->head = mm_state->head->next;
 792 +
 793 +               /* force scan for those added after the last iteration */
 794 +               if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
 795 +                       mm_state->tail = mm_state->head;
 796 +                       walk->force_scan = true;
 797 +               }
 798 +
 799 +               if (should_skip_mm(mm, walk))
 800 +                       mm = NULL;
 801 +       }
 802 +
 803 +       if (mm_state->head == &mm_list->fifo)
 804 +               WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
 805 +done:
 806 +       if (*iter && !mm)
 807 +               mm_state->nr_walkers--;
 808 +       if (!*iter && mm)
 809 +               mm_state->nr_walkers++;
 810 +
 811 +       if (mm_state->nr_walkers)
 812 +               last = false;
 813 +
 814 +       if (*iter || last)
 815 +               reset_mm_stats(lruvec, walk, last);
 816 +
 817 +       spin_unlock(&mm_list->lock);
 818 +
 819 +       if (mm && first)
 820 +               reset_bloom_filter(lruvec, walk->max_seq + 1);
 821 +
 822 +       if (*iter)
 823 +               mmput_async(*iter);
 824 +
 825 +       *iter = mm;
 826 +
 827 +       return last;
 828 +}
 829 +
 830 +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
 831 +{
 832 +       bool success = false;
 833 +       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 834 +       struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
 835 +       struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
 836 +
 837 +       spin_lock(&mm_list->lock);
 838 +
 839 +       VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
 840 +
 841 +       if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
 842 +               VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
 843 +
 844 +               WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
 845 +               reset_mm_stats(lruvec, NULL, true);
 846 +               success = true;
 847 +       }
 848 +
 849 +       spin_unlock(&mm_list->lock);
 850 +
 851 +       return success;
 852 +}
 853 +
 854 +/******************************************************************************
 855   *                          refault feedback loop
 856   ******************************************************************************/
 857
 858 @@ -3048,6 +3415,118 @@ static int page_inc_gen(struct lruvec *l
 859         return new_gen;
 860  }
 861
 862 +static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page,
 863 +                             int old_gen, int new_gen)
 864 +{
 865 +       int type = page_is_file_lru(page);
 866 +       int zone = page_zonenum(page);
 867 +       int delta = thp_nr_pages(page);
 868 +
 869 +       VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
 870 +       VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
 871 +
 872 +       walk->batched++;
 873 +
 874 +       walk->nr_pages[old_gen][type][zone] -= delta;
 875 +       walk->nr_pages[new_gen][type][zone] += delta;
 876 +}
 877 +
 878 +static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
 879 +{
 880 +       int gen, type, zone;
 881 +       struct lru_gen_struct *lrugen = &lruvec->lrugen;
 882 +
 883 +       walk->batched = 0;
 884 +
 885 +       for_each_gen_type_zone(gen, type, zone) {
 886 +               enum lru_list lru = type * LRU_INACTIVE_FILE;
 887 +               int delta = walk->nr_pages[gen][type][zone];
 888 +
 889 +               if (!delta)
 890 +                       continue;
 891 +
 892 +               walk->nr_pages[gen][type][zone] = 0;
 893 +               WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
 894 +                          lrugen->nr_pages[gen][type][zone] + delta);
 895 +
 896 +               if (lru_gen_is_active(lruvec, gen))
 897 +                       lru += LRU_ACTIVE;
 898 +               __update_lru_size(lruvec, lru, zone, delta);
 899 +       }
 900 +}
 901 +
 902 +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
 903 +{
 904 +       struct address_space *mapping;
 905 +       struct vm_area_struct *vma = args->vma;
 906 +       struct lru_gen_mm_walk *walk = args->private;
 907 +
 908 +       if (!vma_is_accessible(vma))
 909 +               return true;
 910 +
 911 +       if (is_vm_hugetlb_page(vma))
 912 +               return true;
 913 +
 914 +       if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
 915 +               return true;
 916 +
 917 +       if (vma == get_gate_vma(vma->vm_mm))
 918 +               return true;
 919 +
 920 +       if (vma_is_anonymous(vma))
 921 +               return !walk->can_swap;
 922 +
 923 +       if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
 924 +               return true;
 925 +
 926 +       mapping = vma->vm_file->f_mapping;
 927 +       if (mapping_unevictable(mapping))
 928 +               return true;
 929 +
 930 +       if (shmem_mapping(mapping))
 931 +               return !walk->can_swap;
 932 +
 933 +       /* to exclude special mappings like dax, etc. */
 934 +       return !mapping->a_ops->readpage;
 935 +}
 936 +
 937 +/*
 938 + * Some userspace memory allocators map many single-page VMAs. Instead of
 939 + * returning back to the PGD table for each of such VMAs, finish an entire PMD
 940 + * table to reduce zigzags and improve cache performance.
 941 + */
 942 +static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
 943 +                        unsigned long *vm_start, unsigned long *vm_end)
 944 +{
 945 +       unsigned long start = round_up(*vm_end, size);
 946 +       unsigned long end = (start | ~mask) + 1;
 947 +
 948 +       VM_WARN_ON_ONCE(mask & size);
 949 +       VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
 950 +
 951 +       while (args->vma) {
 952 +               if (start >= args->vma->vm_end) {
 953 +                       args->vma = args->vma->vm_next;
 954 +                       continue;
 955 +               }
 956 +
 957 +               if (end && end <= args->vma->vm_start)
 958 +                       return false;
 959 +
 960 +               if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) {
 961 +                       args->vma = args->vma->vm_next;
 962 +                       continue;
 963 +               }
 964 +
 965 +               *vm_start = max(start, args->vma->vm_start);
 966 +               *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
 967 +
 968 +               return true;
 969 +       }
 970 +
 971 +       return false;
 972 +}
 973 +
 974  static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
 975  {
 976         unsigned long pfn = pte_pfn(pte);
 977 @@ -3066,8 +3545,28 @@ static unsigned long get_pte_pfn(pte_t p
 978         return pfn;
 979  }
 980
 981 +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
 982 +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
 983 +{
 984 +       unsigned long pfn = pmd_pfn(pmd);
 985 +
 986 +       VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
 987 +
 988 +       if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
 989 +               return -1;
 990 +
 991 +       if (WARN_ON_ONCE(pmd_devmap(pmd)))
 992 +               return -1;
 993 +
 994 +       if (WARN_ON_ONCE(!pfn_valid(pfn)))
 995 +               return -1;
 996 +
 997 +       return pfn;
 998 +}
 999 +#endif
1000 +
1001  static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
1002 -                                struct pglist_data *pgdat)
1003 +                                struct pglist_data *pgdat, bool can_swap)
1004  {
1005         struct page *page;
1006
1007 @@ -3082,9 +3581,375 @@ static struct page *get_pfn_page(unsigne
1008         if (page_memcg_rcu(page) != memcg)
1009                 return NULL;
1010
1011 +       /* file VMAs can contain anon pages from COW */
1012 +       if (!page_is_file_lru(page) && !can_swap)
1013 +               return NULL;
1014 +
1015         return page;
1016  }
1017
1018 +static bool suitable_to_scan(int total, int young)
1019 +{
1020 +       int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
1021 +
1022 +       /* suitable if the average number of young PTEs per cacheline is >=1 */
1023 +       return young * n >= total;
1024 +}
1025 +
1026 +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
1027 +                          struct mm_walk *args)
1028 +{
1029 +       int i;
1030 +       pte_t *pte;
1031 +       spinlock_t *ptl;
1032 +       unsigned long addr;
1033 +       int total = 0;
1034 +       int young = 0;
1035 +       struct lru_gen_mm_walk *walk = args->private;
1036 +       struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
1037 +       struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
1038 +       int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
1039 +
1040 +       VM_WARN_ON_ONCE(pmd_leaf(*pmd));
1041 +
1042 +       ptl = pte_lockptr(args->mm, pmd);
1043 +       if (!spin_trylock(ptl))
1044 +               return false;
1045 +
1046 +       arch_enter_lazy_mmu_mode();
1047 +
1048 +       pte = pte_offset_map(pmd, start & PMD_MASK);
1049 +restart:
1050 +       for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
1051 +               unsigned long pfn;
1052 +               struct page *page;
1053 +
1054 +               total++;
1055 +               walk->mm_stats[MM_LEAF_TOTAL]++;
1056 +
1057 +               pfn = get_pte_pfn(pte[i], args->vma, addr);
1058 +               if (pfn == -1)
1059 +                       continue;
1060 +
1061 +               if (!pte_young(pte[i])) {
1062 +                       walk->mm_stats[MM_LEAF_OLD]++;
1063 +                       continue;
1064 +               }
1065 +
1066 +               page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
1067 +               if (!page)
1068 +                       continue;
1069 +
1070 +               if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
1071 +                       VM_WARN_ON_ONCE(true);
1072 +
1073 +               young++;
1074 +               walk->mm_stats[MM_LEAF_YOUNG]++;
1075 +
1076 +               if (pte_dirty(pte[i]) && !PageDirty(page) &&
1077 +                   !(PageAnon(page) && PageSwapBacked(page) &&
1078 +                     !PageSwapCache(page)))
1079 +                       set_page_dirty(page);
1080 +
1081 +               old_gen = page_update_gen(page, new_gen);
1082 +               if (old_gen >= 0 && old_gen != new_gen)
1083 +                       update_batch_size(walk, page, old_gen, new_gen);
1084 +       }
1085 +
1086 +       if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
1087 +               goto restart;
1088 +
1089 +       pte_unmap(pte);
1090 +
1091 +       arch_leave_lazy_mmu_mode();
1092 +       spin_unlock(ptl);
1093 +
1094 +       return suitable_to_scan(total, young);
1095 +}
1096 +
1097 +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
1098 +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
1099 +                                 struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
1100 +{
1101 +       int i;
1102 +       pmd_t *pmd;
1103 +       spinlock_t *ptl;
1104 +       struct lru_gen_mm_walk *walk = args->private;
1105 +       struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
1106 +       struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
1107 +       int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
1108 +
1109 +       VM_WARN_ON_ONCE(pud_leaf(*pud));
1110 +
1111 +       /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
1112 +       if (*start == -1) {
1113 +               *start = next;
1114 +               return;
1115 +       }
1116 +
1117 +       i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
1118 +       if (i && i <= MIN_LRU_BATCH) {
1119 +               __set_bit(i - 1, bitmap);
1120 +               return;
1121 +       }
1122 +
1123 +       pmd = pmd_offset(pud, *start);
1124 +
1125 +       ptl = pmd_lockptr(args->mm, pmd);
1126 +       if (!spin_trylock(ptl))
1127 +               goto done;
1128 +
1129 +       arch_enter_lazy_mmu_mode();
1130 +
1131 +       do {
1132 +               unsigned long pfn;
1133 +               struct page *page;
1134 +               unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
1135 +
1136 +               pfn = get_pmd_pfn(pmd[i], vma, addr);
1137 +               if (pfn == -1)
1138 +                       goto next;
1139 +
1140 +               if (!pmd_trans_huge(pmd[i])) {
1141 +                       if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
1142 +                               pmdp_test_and_clear_young(vma, addr, pmd + i);
1143 +                       goto next;
1144 +               }
1145 +
1146 +               page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
1147 +               if (!page)
1148 +                       goto next;
1149 +
1150 +               if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
1151 +                       goto next;
1152 +
1153 +               walk->mm_stats[MM_LEAF_YOUNG]++;
1154 +
1155 +               if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
1156 +                   !(PageAnon(page) && PageSwapBacked(page) &&
1157 +                     !PageSwapCache(page)))
1158 +                       set_page_dirty(page);
1159 +
1160 +               old_gen = page_update_gen(page, new_gen);
1161 +               if (old_gen >= 0 && old_gen != new_gen)
1162 +                       update_batch_size(walk, page, old_gen, new_gen);
1163 +next:
1164 +               i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
1165 +       } while (i <= MIN_LRU_BATCH);
1166 +
1167 +       arch_leave_lazy_mmu_mode();
1168 +       spin_unlock(ptl);
1169 +done:
1170 +       *start = -1;
1171 +       bitmap_zero(bitmap, MIN_LRU_BATCH);
1172 +}
1173 +#else
1174 +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
1175 +                                 struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
1176 +{
1177 +}
1178 +#endif
1179 +
1180 +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
1181 +                          struct mm_walk *args)
1182 +{
1183 +       int i;
1184 +       pmd_t *pmd;
1185 +       unsigned long next;
1186 +       unsigned long addr;
1187 +       struct vm_area_struct *vma;
1188 +       unsigned long pos = -1;
1189 +       struct lru_gen_mm_walk *walk = args->private;
1190 +       unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
1191 +
1192 +       VM_WARN_ON_ONCE(pud_leaf(*pud));
1193 +
1194 +       /*
1195 +        * Finish an entire PMD in two passes: the first only reaches to PTE
1196 +        * tables to avoid taking the PMD lock; the second, if necessary, takes
1197 +        * the PMD lock to clear the accessed bit in PMD entries.
1198 +        */
1199 +       pmd = pmd_offset(pud, start & PUD_MASK);
1200 +restart:
1201 +       /* walk_pte_range() may call get_next_vma() */
1202 +       vma = args->vma;
1203 +       for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
1204 +               pmd_t val = pmd_read_atomic(pmd + i);
1205 +
1206 +               /* for pmd_read_atomic() */
1207 +               barrier();
1208 +
1209 +               next = pmd_addr_end(addr, end);
1210 +
1211 +               if (!pmd_present(val) || is_huge_zero_pmd(val)) {
1212 +                       walk->mm_stats[MM_LEAF_TOTAL]++;
1213 +                       continue;
1214 +               }
1215 +
1216 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1217 +               if (pmd_trans_huge(val)) {
1218 +                       unsigned long pfn = pmd_pfn(val);
1219 +                       struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
1220 +
1221 +                       walk->mm_stats[MM_LEAF_TOTAL]++;
1222 +
1223 +                       if (!pmd_young(val)) {
1224 +                               walk->mm_stats[MM_LEAF_OLD]++;
1225 +                               continue;
1226 +                       }
1227 +
1228 +                       /* try to avoid unnecessary memory loads */
1229 +                       if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
1230 +                               continue;
1231 +
1232 +                       walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
1233 +                       continue;
1234 +               }
1235 +#endif
1236 +               walk->mm_stats[MM_NONLEAF_TOTAL]++;
1237 +
1238 +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
1239 +               if (!pmd_young(val))
1240 +                       continue;
1241 +
1242 +               walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
1243 +#endif
1244 +               if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
1245 +                       continue;
1246 +
1247 +               walk->mm_stats[MM_NONLEAF_FOUND]++;
1248 +
1249 +               if (!walk_pte_range(&val, addr, next, args))
1250 +                       continue;
1251 +
1252 +               walk->mm_stats[MM_NONLEAF_ADDED]++;
1253 +
1254 +               /* carry over to the next generation */
1255 +               update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
1256 +       }
1257 +
1258 +       walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
1259 +
1260 +       if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
1261 +               goto restart;
1262 +}
1263 +
1264 +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
1265 +                         struct mm_walk *args)
1266 +{
1267 +       int i;
1268 +       pud_t *pud;
1269 +       unsigned long addr;
1270 +       unsigned long next;
1271 +       struct lru_gen_mm_walk *walk = args->private;
1272 +
1273 +       VM_WARN_ON_ONCE(p4d_leaf(*p4d));
1274 +
1275 +       pud = pud_offset(p4d, start & P4D_MASK);
1276 +restart:
1277 +       for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
1278 +               pud_t val = READ_ONCE(pud[i]);
1279 +
1280 +               next = pud_addr_end(addr, end);
1281 +
1282 +               if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
1283 +                       continue;
1284 +
1285 +               walk_pmd_range(&val, addr, next, args);
1286 +
1287 +               /* a racy check to curtail the waiting time */
1288 +               if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
1289 +                       return 1;
1290 +
1291 +               if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
1292 +                       end = (addr | ~PUD_MASK) + 1;
1293 +                       goto done;
1294 +               }
1295 +       }
1296 +
1297 +       if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
1298 +               goto restart;
1299 +
1300 +       end = round_up(end, P4D_SIZE);
1301 +done:
1302 +       if (!end || !args->vma)
1303 +               return 1;
1304 +
1305 +       walk->next_addr = max(end, args->vma->vm_start);
1306 +
1307 +       return -EAGAIN;
1308 +}
1309 +
1310 +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
1311 +{
1312 +       static const struct mm_walk_ops mm_walk_ops = {
1313 +               .test_walk = should_skip_vma,
1314 +               .p4d_entry = walk_pud_range,
1315 +       };
1316 +
1317 +       int err;
1318 +       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
1319 +
1320 +       walk->next_addr = FIRST_USER_ADDRESS;
1321 +
1322 +       do {
1323 +               err = -EBUSY;
1324 +
1325 +               /* page_update_gen() requires stable page_memcg() */
1326 +               if (!mem_cgroup_trylock_pages(memcg))
1327 +                       break;
1328 +
1329 +               /* the caller might be holding the lock for write */
1330 +               if (mmap_read_trylock(mm)) {
1331 +                       err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
1332 +
1333 +                       mmap_read_unlock(mm);
1334 +               }
1335 +
1336 +               mem_cgroup_unlock_pages();
1337 +
1338 +               if (walk->batched) {
1339 +                       spin_lock_irq(&lruvec->lru_lock);
1340 +                       reset_batch_size(lruvec, walk);
1341 +                       spin_unlock_irq(&lruvec->lru_lock);
1342 +               }
1343 +
1344 +               cond_resched();
1345 +       } while (err == -EAGAIN);
1346 +}
1347 +
1348 +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
1349 +{
1350 +       struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
1351 +
1352 +       if (pgdat && current_is_kswapd()) {
1353 +               VM_WARN_ON_ONCE(walk);
1354 +
1355 +               walk = &pgdat->mm_walk;
1356 +       } else if (!pgdat && !walk) {
1357 +               VM_WARN_ON_ONCE(current_is_kswapd());
1358 +
1359 +               walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
1360 +       }
1361 +
1362 +       current->reclaim_state->mm_walk = walk;
1363 +
1364 +       return walk;
1365 +}
1366 +
1367 +static void clear_mm_walk(void)
1368 +{
1369 +       struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
1370 +
1371 +       VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
1372 +       VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
1373 +
1374 +       current->reclaim_state->mm_walk = NULL;
1375 +
1376 +       if (!current_is_kswapd())
1377 +               kfree(walk);
1378 +}
1379 +
1380  static void inc_min_seq(struct lruvec *lruvec, int type)
1381  {
1382         struct lru_gen_struct *lrugen = &lruvec->lrugen;
1383 @@ -3136,7 +4001,7 @@ next:
1384         return success;
1385  }
1386
1387 -static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
1388 +static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
1389  {
1390         int prev, next;
1391         int type, zone;
1392 @@ -3146,9 +4011,6 @@ static void inc_max_seq(struct lruvec *l
1393
1394         VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
1395
1396 -       if (max_seq != lrugen->max_seq)
1397 -               goto unlock;
1398 -
1399         for (type = ANON_AND_FILE - 1; type >= 0; type--) {
1400                 if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
1401                         continue;
1402 @@ -3186,10 +4048,76 @@ static void inc_max_seq(struct lruvec *l
1403
1404         /* make sure preceding modifications appear */
1405         smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
1406 -unlock:
1407 +
1408         spin_unlock_irq(&lruvec->lru_lock);
1409  }
1410
1411 +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
1412 +                              struct scan_control *sc, bool can_swap)
1413 +{
1414 +       bool success;
1415 +       struct lru_gen_mm_walk *walk;
1416 +       struct mm_struct *mm = NULL;
1417 +       struct lru_gen_struct *lrugen = &lruvec->lrugen;
1418 +
1419 +       VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
1420 +
1421 +       /* see the comment in iterate_mm_list() */
1422 +       if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
1423 +               success = false;
1424 +               goto done;
1425 +       }
1426 +
1427 +       /*
1428 +        * If the hardware doesn't automatically set the accessed bit, fall back
1429 +        * to lru_gen_look_around(), which only clears the accessed bit in a
1430 +        * handful of PTEs. Spreading the work out over a period of time usually
1431 +        * is less efficient, but it avoids bursty page faults.
1432 +        */
1433 +       if (!arch_has_hw_pte_young()) {
1434 +               success = iterate_mm_list_nowalk(lruvec, max_seq);
1435 +               goto done;
1436 +       }
1437 +
1438 +       walk = set_mm_walk(NULL);
1439 +       if (!walk) {
1440 +               success = iterate_mm_list_nowalk(lruvec, max_seq);
1441 +               goto done;
1442 +       }
1443 +
1444 +       walk->lruvec = lruvec;
1445 +       walk->max_seq = max_seq;
1446 +       walk->can_swap = can_swap;
1447 +       walk->force_scan = false;
1448 +
1449 +       do {
1450 +               success = iterate_mm_list(lruvec, walk, &mm);
1451 +               if (mm)
1452 +                       walk_mm(lruvec, mm, walk);
1453 +
1454 +               cond_resched();
1455 +       } while (mm);
1456 +done:
1457 +       if (!success) {
1458 +               if (sc->priority <= DEF_PRIORITY - 2)
1459 +                       wait_event_killable(lruvec->mm_state.wait,
1460 +                                           max_seq < READ_ONCE(lrugen->max_seq));
1461 +
1462 +               return max_seq < READ_ONCE(lrugen->max_seq);
1463 +       }
1464 +
1465 +       VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
1466 +
1467 +       inc_max_seq(lruvec, can_swap);
1468 +       /* either this sees any waiters or they will see updated max_seq */
1469 +       if (wq_has_sleeper(&lruvec->mm_state.wait))
1470 +               wake_up_all(&lruvec->mm_state.wait);
1471 +
1472 +       wakeup_flusher_threads(WB_REASON_VMSCAN);
1473 +
1474 +       return true;
1475 +}
1476 +
1477  static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
1478                              struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
1479  {
1480 @@ -3265,7 +4193,7 @@ static void age_lruvec(struct lruvec *lr
1481
1482         need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
1483         if (need_aging)
1484 -               inc_max_seq(lruvec, max_seq, swappiness);
1485 +               try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
1486  }
1487
1488  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
1489 @@ -3274,6 +4202,8 @@ static void lru_gen_age_node(struct pgli
1490
1491         VM_WARN_ON_ONCE(!current_is_kswapd());
1492
1493 +       set_mm_walk(pgdat);
1494 +
1495         memcg = mem_cgroup_iter(NULL, NULL, NULL);
1496         do {
1497                 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
1498 @@ -3282,11 +4212,16 @@ static void lru_gen_age_node(struct pgli
1499
1500                 cond_resched();
1501         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
1502 +
1503 +       clear_mm_walk();
1504  }
1505
1506  /*
1507   * This function exploits spatial locality when shrink_page_list() walks the
1508 - * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
1509 + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
1510 + * the scan was done cacheline efficiently, it adds the PMD entry pointing to
1511 + * the PTE table to the Bloom filter. This forms a feedback loop between the
1512 + * eviction and the aging.
1513   */
1514  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
1515  {
1516 @@ -3295,6 +4230,8 @@ void lru_gen_look_around(struct page_vma
1517         unsigned long start;
1518         unsigned long end;
1519         unsigned long addr;
1520 +       struct lru_gen_mm_walk *walk;
1521 +       int young = 0;
1522         unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
1523         struct page *page = pvmw->page;
1524         struct mem_cgroup *memcg = page_memcg(page);
1525 @@ -3309,6 +4246,9 @@ void lru_gen_look_around(struct page_vma
1526         if (spin_is_contended(pvmw->ptl))
1527                 return;
1528
1529 +       /* avoid taking the LRU lock under the PTL when possible */
1530 +       walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
1531 +
1532         start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
1533         end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
1534
1535 @@ -3338,13 +4278,15 @@ void lru_gen_look_around(struct page_vma
1536                 if (!pte_young(pte[i]))
1537                         continue;
1538
1539 -               page = get_pfn_page(pfn, memcg, pgdat);
1540 +               page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap);
1541                 if (!page)
1542                         continue;
1543
1544                 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
1545                         VM_WARN_ON_ONCE(true);
1546
1547 +               young++;
1548 +
1549                 if (pte_dirty(pte[i]) && !PageDirty(page) &&
1550                     !(PageAnon(page) && PageSwapBacked(page) &&
1551                       !PageSwapCache(page)))
1552 @@ -3360,7 +4302,11 @@ void lru_gen_look_around(struct page_vma
1553         arch_leave_lazy_mmu_mode();
1554         rcu_read_unlock();
1555
1556 -       if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
1557 +       /* feedback from rmap walkers to page table walkers */
1558 +       if (suitable_to_scan(i, young))
1559 +               update_bloom_filter(lruvec, max_seq, pvmw->pmd);
1560 +
1561 +       if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
1562                 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
1563                         page = pte_page(pte[i]);
1564                         activate_page(page);
1565 @@ -3372,8 +4318,10 @@ void lru_gen_look_around(struct page_vma
1566         if (!mem_cgroup_trylock_pages(memcg))
1567                 return;
1568
1569 -       spin_lock_irq(&lruvec->lru_lock);
1570 -       new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
1571 +       if (!walk) {
1572 +               spin_lock_irq(&lruvec->lru_lock);
1573 +               new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
1574 +       }
1575
1576         for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
1577                 page = compound_head(pte_page(pte[i]));
1578 @@ -3384,10 +4332,14 @@ void lru_gen_look_around(struct page_vma
1579                 if (old_gen < 0 || old_gen == new_gen)
1580                         continue;
1581
1582 -               lru_gen_update_size(lruvec, page, old_gen, new_gen);
1583 +               if (walk)
1584 +                       update_batch_size(walk, page, old_gen, new_gen);
1585 +               else
1586 +                       lru_gen_update_size(lruvec, page, old_gen, new_gen);
1587         }
1588
1589 -       spin_unlock_irq(&lruvec->lru_lock);
1590 +       if (!walk)
1591 +               spin_unlock_irq(&lruvec->lru_lock);
1592
1593         mem_cgroup_unlock_pages();
1594  }
1595 @@ -3670,6 +4622,7 @@ static int evict_pages(struct lruvec *lr
1596         struct page *page;
1597         enum vm_event_item item;
1598         struct reclaim_stat stat;
1599 +       struct lru_gen_mm_walk *walk;
1600         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
1601         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1602
1603 @@ -3706,6 +4659,10 @@ static int evict_pages(struct lruvec *lr
1604
1605         move_pages_to_lru(lruvec, &list);
1606
1607 +       walk = current->reclaim_state->mm_walk;
1608 +       if (walk && walk->batched)
1609 +               reset_batch_size(lruvec, walk);
1610 +
1611         item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
1612         if (!cgroup_reclaim(sc))
1613                 __count_vm_events(item, reclaimed);
1614 @@ -3722,6 +4679,11 @@ static int evict_pages(struct lruvec *lr
1615         return scanned;
1616  }
1617
1618 +/*
1619 + * For future optimizations:
1620 + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
1621 + *    reclaim.
1622 + */
1623  static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
1624                                     bool can_swap)
1625  {
1626 @@ -3747,7 +4709,8 @@ static unsigned long get_nr_to_scan(stru
1627         if (current_is_kswapd())
1628                 return 0;
1629
1630 -       inc_max_seq(lruvec, max_seq, can_swap);
1631 +       if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
1632 +               return nr_to_scan;
1633  done:
1634         return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
1635  }
1636 @@ -3761,6 +4724,8 @@ static void lru_gen_shrink_lruvec(struct
1637
1638         blk_start_plug(&plug);
1639
1640 +       set_mm_walk(lruvec_pgdat(lruvec));
1641 +
1642         while (true) {
1643                 int delta;
1644                 int swappiness;
1645 @@ -3788,6 +4753,8 @@ static void lru_gen_shrink_lruvec(struct
1646                 cond_resched();
1647         }
1648
1649 +       clear_mm_walk();
1650 +
1651         blk_finish_plug(&plug);
1652  }
1653
1654 @@ -3804,15 +4771,21 @@ void lru_gen_init_lruvec(struct lruvec *
1655
1656         for_each_gen_type_zone(gen, type, zone)
1657                 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
1658 +
1659 +       lruvec->mm_state.seq = MIN_NR_GENS;
1660 +       init_waitqueue_head(&lruvec->mm_state.wait);
1661  }
1662
1663  #ifdef CONFIG_MEMCG
1664  void lru_gen_init_memcg(struct mem_cgroup *memcg)
1665  {
1666 +       INIT_LIST_HEAD(&memcg->mm_list.fifo);
1667 +       spin_lock_init(&memcg->mm_list.lock);
1668  }
1669
1670  void lru_gen_exit_memcg(struct mem_cgroup *memcg)
1671  {
1672 +       int i;
1673         int nid;
1674
1675         for_each_node(nid) {
1676 @@ -3820,6 +4793,11 @@ void lru_gen_exit_memcg(struct mem_cgrou
1677
1678                 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
1679                                            sizeof(lruvec->lrugen.nr_pages)));
1680 +
1681 +               for (i = 0; i < NR_BLOOM_FILTERS; i++) {
1682 +                       bitmap_free(lruvec->mm_state.filters[i]);
1683 +                       lruvec->mm_state.filters[i] = NULL;
1684 +               }
1685         }
1686  }
1687  #endif