kernel: Backport MGLRU patch from 6.4
[openwrt/openwrt.git] target/linux/generic/backport-5.15/021-v6.4-mm-Multi-gen-LRU-remove-wait_event_killable.patch
From 087ed25eaf5a78a678508e893f80addab9b1c103 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Thu, 13 Apr 2023 14:43:26 -0700
Subject: [PATCH] mm: Multi-gen LRU: remove wait_event_killable()

Android 14 and later default to MGLRU [1] and field telemetry showed
occasional long tail latency (>100ms) in the reclaim path.

Tracing revealed priority inversion in the reclaim path. In
try_to_inc_max_seq(), when high priority tasks were blocked on
wait_event_killable(), the preemption of the low priority task to call
wake_up_all() caused those high priority tasks to wait longer than
necessary. In general, this problem is not different from others of its
kind, e.g., one caused by mutex_lock(). However, it is specific to MGLRU
because it introduced the new wait queue lruvec->mm_state.wait.
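
Concretely, the two sides of the race in the pre-patch try_to_inc_max_seq()
look roughly like this (simplified from the lines removed in the mm/vmscan.c
hunks below, not a literal excerpt):

    /* the reclaimer that wins the race, possibly low priority */
    inc_max_seq(lruvec, can_swap, force_scan);
    /* if the winner is preempted here, the sleepers below keep waiting */
    if (wq_has_sleeper(&lruvec->mm_state.wait))
            wake_up_all(&lruvec->mm_state.wait);

    /* every other direct reclaimer, possibly high priority */
    if (sc->priority <= DEF_PRIORITY - 2)
            wait_event_killable(lruvec->mm_state.wait,
                                max_seq < READ_ONCE(lrugen->max_seq));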

The purpose of this new wait queue is to avoid the thundering herd
problem. If many direct reclaimers rush into try_to_inc_max_seq(), only
one can succeed, i.e., the one to wake up the rest, and the rest who
failed might cause premature OOM kills if they do not wait. So far there
is no evidence supporting this scenario, based on how often the wait has
been hit. And this begs the question how useful the wait queue is in
practice.

Based on Minchan's recommendation, which is in line with his commit
6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the
rest of the MGLRU code which also uses trylock when possible, remove the
wait queue.
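
With the wait queue gone, a walker that loses the race simply bails out and
the winner advances the generation without waking anyone; condensed from the
lines added in the mm/vmscan.c hunks below (again not a literal excerpt):

    /* in walk_mm(): give up if another thread already called inc_max_seq() */
    DEFINE_MAX_SEQ(lruvec);

    err = -EBUSY;
    if (walk->max_seq != max_seq)
            break;

    /* in try_to_inc_max_seq(): no sleeping, no wake-ups */
    if (success)
            inc_max_seq(lruvec, can_swap, force_scan);

    return success;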

[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf

Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Change-Id: I911f3968fd1adb25171279cc5b6f48ccb7efc8de
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
Reported-by: Wei Wang <wvw@google.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |   8 +--
 mm/vmscan.c            | 111 +++++++++++++++--------------------------
 2 files changed, 42 insertions(+), 77 deletions(-)

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -452,18 +452,14 @@ enum {
 struct lru_gen_mm_state {
         /* set to max_seq after each iteration */
         unsigned long seq;
-        /* where the current iteration continues (inclusive) */
+        /* where the current iteration continues after */
         struct list_head *head;
-        /* where the last iteration ended (exclusive) */
+        /* where the last iteration ended before */
         struct list_head *tail;
-        /* to wait for the last page table walker to finish */
-        struct wait_queue_head wait;
         /* Bloom filters flip after each iteration */
         unsigned long *filters[NR_BLOOM_FILTERS];
         /* the mm stats for debugging */
         unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
-        /* the number of concurrent page table walkers */
-        int nr_walkers;
 };
 
 struct lru_gen_mm_walk {
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2999,18 +2999,13 @@ void lru_gen_del_mm(struct mm_struct *mm
                 if (!lruvec)
                         continue;
 
-                /* where the last iteration ended (exclusive) */
+                /* where the current iteration continues after */
+                if (lruvec->mm_state.head == &mm->lru_gen.list)
+                        lruvec->mm_state.head = lruvec->mm_state.head->prev;
+
+                /* where the last iteration ended before */
                 if (lruvec->mm_state.tail == &mm->lru_gen.list)
                         lruvec->mm_state.tail = lruvec->mm_state.tail->next;
-
-                /* where the current iteration continues (inclusive) */
-                if (lruvec->mm_state.head != &mm->lru_gen.list)
-                        continue;
-
-                lruvec->mm_state.head = lruvec->mm_state.head->next;
-                /* the deletion ends the current iteration */
-                if (lruvec->mm_state.head == &mm_list->fifo)
-                        WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
         }
 
         list_del_init(&mm->lru_gen.list);
@@ -3194,68 +3189,54 @@ static bool iterate_mm_list(struct lruve
                             struct mm_struct **iter)
 {
         bool first = false;
-        bool last = true;
+        bool last = false;
         struct mm_struct *mm = NULL;
         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
         struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
         struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
 
         /*
-         * There are four interesting cases for this page table walker:
-         * 1. It tries to start a new iteration of mm_list with a stale max_seq;
-         *    there is nothing left to do.
-         * 2. It's the first of the current generation, and it needs to reset
-         *    the Bloom filter for the next generation.
-         * 3. It reaches the end of mm_list, and it needs to increment
-         *    mm_state->seq; the iteration is done.
-         * 4. It's the last of the current generation, and it needs to reset the
-         *    mm stats counters for the next generation.
+         * mm_state->seq is incremented after each iteration of mm_list. There
+         * are three interesting cases for this page table walker:
+         * 1. It tries to start a new iteration with a stale max_seq: there is
+         *    nothing left to do.
+         * 2. It started the next iteration: it needs to reset the Bloom filter
+         *    so that a fresh set of PTE tables can be recorded.
+         * 3. It ended the current iteration: it needs to reset the mm stats
+         *    counters and tell its caller to increment max_seq.
          */
         spin_lock(&mm_list->lock);
 
         VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
-        VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
-        VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
 
-        if (walk->max_seq <= mm_state->seq) {
-                if (!*iter)
-                        last = false;
+        if (walk->max_seq <= mm_state->seq)
                 goto done;
-        }
 
-        if (!mm_state->nr_walkers) {
-                VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+        if (!mm_state->head)
+                mm_state->head = &mm_list->fifo;
 
-                mm_state->head = mm_list->fifo.next;
+        if (mm_state->head == &mm_list->fifo)
                 first = true;
-        }
-
-        while (!mm && mm_state->head != &mm_list->fifo) {
-                mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
 
+        do {
                 mm_state->head = mm_state->head->next;
+                if (mm_state->head == &mm_list->fifo) {
+                        WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+                        last = true;
+                        break;
+                }
 
                 /* force scan for those added after the last iteration */
-                if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
-                        mm_state->tail = mm_state->head;
+                if (!mm_state->tail || mm_state->tail == mm_state->head) {
+                        mm_state->tail = mm_state->head->next;
                         walk->force_scan = true;
                 }
 
+                mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
                 if (should_skip_mm(mm, walk))
                         mm = NULL;
-        }
-
-        if (mm_state->head == &mm_list->fifo)
-                WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+        } while (!mm);
 done:
-        if (*iter && !mm)
-                mm_state->nr_walkers--;
-        if (!*iter && mm)
-                mm_state->nr_walkers++;
-
-        if (mm_state->nr_walkers)
-                last = false;
-
         if (*iter || last)
                 reset_mm_stats(lruvec, walk, last);
 
@@ -3283,9 +3264,9 @@ static bool iterate_mm_list_nowalk(struc
 
         VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
 
-        if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
-                VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
-
+        if (max_seq > mm_state->seq) {
+                mm_state->head = NULL;
+                mm_state->tail = NULL;
                 WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
                 reset_mm_stats(lruvec, NULL, true);
                 success = true;
@@ -3894,10 +3875,6 @@ restart:
 
                 walk_pmd_range(&val, addr, next, args);
 
-                /* a racy check to curtail the waiting time */
-                if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
-                        return 1;
-
                 if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
                         end = (addr | ~PUD_MASK) + 1;
                         goto done;
@@ -3930,8 +3907,14 @@ static void walk_mm(struct lruvec *lruve
         walk->next_addr = FIRST_USER_ADDRESS;
 
         do {
+                DEFINE_MAX_SEQ(lruvec);
+
                 err = -EBUSY;
 
+                /* another thread might have called inc_max_seq() */
+                if (walk->max_seq != max_seq)
+                        break;
+
                 /* page_update_gen() requires stable page_memcg() */
                 if (!mem_cgroup_trylock_pages(memcg))
                         break;
@@ -4164,25 +4147,12 @@ static bool try_to_inc_max_seq(struct lr
                 success = iterate_mm_list(lruvec, walk, &mm);
                 if (mm)
                         walk_mm(lruvec, mm, walk);
-
-                cond_resched();
         } while (mm);
 done:
-        if (!success) {
-                if (sc->priority <= DEF_PRIORITY - 2)
-                        wait_event_killable(lruvec->mm_state.wait,
-                                            max_seq < READ_ONCE(lrugen->max_seq));
-                return false;
-        }
+        if (success)
+                inc_max_seq(lruvec, can_swap, force_scan);
 
-        VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
-
-        inc_max_seq(lruvec, can_swap, force_scan);
-        /* either this sees any waiters or they will see updated max_seq */
-        if (wq_has_sleeper(&lruvec->mm_state.wait))
-                wake_up_all(&lruvec->mm_state.wait);
-
-        return true;
+        return success;
 }
 
 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
@@ -5746,7 +5716,6 @@ void lru_gen_init_lruvec(struct lruvec *
                 INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]);
 
         lruvec->mm_state.seq = MIN_NR_GENS;
-        init_waitqueue_head(&lruvec->mm_state.wait);
 }
 
 #ifdef CONFIG_MEMCG