kernel: Update MGLRU patchset
openwrt/staging/dedeckeh.git: target/linux/generic/backport-5.15/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:06 -0600
Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When multiple memcgs are available, it is possible to use generations as a
frame of reference to make better choices and improve overall performance
under global memory pressure. This patch adds a basic optimization to
select memcgs that can drop single-use unmapped clean pages first. Doing
so reduces the chance of going into the aging path or swapping, which can
be costly.
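
For orientation only (this sketch is not part of the upstream patch and uses a
stubbed, user-space stand-in for scan_control), the handshake added here can be
condensed as follows: the eviction path clears a per-reclaim flag once it has
reclaimed enough without needing to age, and the aging path then skips one
round before re-arming the flag. The flag name mirrors sc->memcgs_need_aging
in the diff below.

  /*
   * Minimal user-space sketch of the handshake; the kernel's scan_control
   * is stubbed out, only the flag logic mirrors the patch.
   */
  #include <stdbool.h>
  #include <stdio.h>

  struct sc_sketch {
          /* re-armed by the aging path, cleared by eviction when aging was not needed */
          bool memcgs_need_aging;
  };

  /* aging path: skip one round if the last eviction pass coped without aging */
  static bool age_node_sketch(struct sc_sketch *sc)
  {
          if (!sc->memcgs_need_aging) {
                  sc->memcgs_need_aging = true;
                  return false;           /* skipped the costly aging walk */
          }
          return true;                    /* would walk and age the memcgs */
  }

  /* eviction path: reclaimed enough (e.g. clean unmapped pages) without aging */
  static void shrink_sketch(struct sc_sketch *sc, bool need_aging, bool reclaimed_enough)
  {
          if (reclaimed_enough && !need_aging)
                  sc->memcgs_need_aging = false;
  }

  int main(void)
  {
          struct sc_sketch sc = { .memcgs_need_aging = true };

          shrink_sketch(&sc, false, true);                 /* clean pages sufficed */
          printf("aging ran: %d\n", age_node_sketch(&sc)); /* 0: aging skipped once */
          printf("aging ran: %d\n", age_node_sketch(&sc)); /* 1: aging runs again */
          return 0;
  }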

A typical example that benefits from this optimization is a server running
mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
buffered I/O workload in the other.

Though this optimization can be applied to both kswapd and direct reclaim,
it is only added to kswapd to keep the patchset manageable. Later
improvements may cover the direct reclaim path.

While ensuring certain fairness to all eligible memcgs, proportional scans
of individual memcgs also require proper backoff to avoid overshooting
their aggregate reclaim target by too much. Otherwise it can cause high
direct reclaim latency. The conditions for backoff are:

1. At low priorities, for direct reclaim, if aging fairness or direct
   reclaim latency is at risk, i.e., aging one memcg multiple times or
   swapping after the target is met.
2. At high priorities, for global reclaim, if per-zone free pages are
   above respective watermarks.
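
The same caveat applies to the sketch below: it is an illustration only, not
part of the patch. The authoritative version is should_abort_scan() in the
diff further down; here the kernel's zone, watermark and sequence state is
replaced by plain fields, and the literal 10 stands in for DEF_PRIORITY - 2.

  /*
   * Stand-alone sketch of the backoff test; zone, watermark and generation
   * bookkeeping is reduced to plain fields.
   */
  #include <stdbool.h>

  struct backoff_sketch {
          bool kswapd;              /* global (kswapd) reclaim vs direct reclaim */
          int  times_aged;          /* how often this memcg was aged this round */
          bool target_met;          /* sc->nr_reclaimed >= sc->nr_to_reclaim */
          bool swapped;             /* anon pages were evicted, i.e. swapping */
          int  priority;            /* reclaim priority, lower means more urgent */
          bool free_above_wmark;    /* every eligible zone is above its watermark */
  };

  static bool should_back_off(const struct backoff_sketch *s)
  {
          if (!s->kswapd) {
                  /* condition 1: aging fairness or direct reclaim latency at risk */
                  if (s->times_aged > 1)
                          return true;
                  if (s->target_met && s->swapped)
                          return true;
          }

          /* keep scanning at low priorities (> DEF_PRIORITY - 2) for fairness */
          if (s->priority > 10)
                  return false;

          /* condition 2: per-zone free pages are above the respective watermarks */
          return s->free_above_wmark;
  }

  int main(void)
  {
          const struct backoff_sketch direct = {
                  .kswapd = false, .times_aged = 1, .target_met = true,
                  .swapped = true, .priority = 8, .free_above_wmark = false,
          };

          /* direct reclaim met its target and started swapping: back off */
          return should_back_off(&direct) ? 0 : 1;
  }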

Server benchmark results:
  Mixed workloads:
    fio (buffered I/O): +[19, 21]%
                IOPS         BW
      patch1-8: 1880k        7343MiB/s
      patch1-9: 2252k        8796MiB/s

    memcached (anon): +[119, 123]%
                Ops/sec      KB/sec
      patch1-8: 862768.65    33514.68
      patch1-9: 1911022.12   74234.54

  Mixed workloads:
    fio (buffered I/O): +[75, 77]%
                IOPS         BW
      5.19-rc1: 1279k        4996MiB/s
      patch1-9: 2252k        8796MiB/s

    memcached (anon): +[13, 15]%
                Ops/sec      KB/sec
      5.19-rc1: 1673524.04   65008.87
      patch1-9: 1911022.12   74234.54

Configurations:
  (changes since patch 6)

  cat mixed.sh
  modprobe brd rd_nr=2 rd_size=56623104

  swapoff -a
  mkswap /dev/ram0
  swapon /dev/ram0

  mkfs.ext4 /dev/ram1
  mount -t ext4 /dev/ram1 /mnt

  memtier_benchmark -S /var/run/memcached/memcached.sock \
    -P memcache_binary -n allkeys --key-minimum=1 \
    --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
    --ratio 1:0 --pipeline 8 -d 2000

  fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
    --buffered=1 --ioengine=io_uring --iodepth=128 \
    --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
    --rw=randread --random_distribution=random --norandommap \
    --time_based --ramp_time=10m --runtime=90m --group_reporting &
  pid=$!

  sleep 200

  memtier_benchmark -S /var/run/memcached/memcached.sock \
    -P memcache_binary -n allkeys --key-minimum=1 \
    --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
    --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed

  kill -INT $pid
  wait

Client benchmark results:
  no change (CONFIG_MEMCG=n)

Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 96 insertions(+), 9 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index a7844c689522..b6f6fc2585e1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -127,6 +127,12 @@ struct scan_control {
 /* Always discard instead of demoting to lower tier memory */
 unsigned int no_demotion:1;

+#ifdef CONFIG_LRU_GEN
+ /* help kswapd make better choices among multiple memcgs */
+ unsigned int memcgs_need_aging:1;
+ unsigned long last_reclaimed;
+#endif
+
 /* Allocation order */
 s8 order;

@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)

 VM_WARN_ON_ONCE(!current_is_kswapd());

+ sc->last_reclaimed = sc->nr_reclaimed;
+
+ /*
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ return;
+ }
+
 set_mm_walk(pgdat);

 memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swa
 return scanned;
 }

-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *need_swapping)
 {
 int type;
 int scanned;
@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp

 sc->nr_reclaimed += reclaimed;

+ if (need_swapping && type == LRU_GEN_ANON)
+ *need_swapping = true;
+
 return scanned;
 }

@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
 * reclaim.
 */
 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+ bool can_swap, bool *need_aging)
 {
- bool need_aging;
 unsigned long nr_to_scan;
 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 DEFINE_MAX_SEQ(lruvec);
@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
 return 0;

- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!need_aging)
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+ if (!*need_aging)
 return nr_to_scan;

 /* skip the aging path at the default priority */
@@ -4715,10 +4737,68 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
 }

+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the target isn't
+ * met, and yet the allocation may still succeed, since kswapd may have
+ * caught up. In either case, it's better to stop now, and restart if
+ * necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 struct blk_plug plug;
+ bool need_aging = false;
+ bool need_swapping = false;
 unsigned long scanned = 0;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);

 lru_add_drain();

@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 else
 swappiness = 0;

- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
 if (!nr_to_scan)
- break;
+ goto done;

- delta = evict_pages(lruvec, sc, swappiness);
+ delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
 if (!delta)
- break;
+ goto done;

 scanned += delta;
 if (scanned >= nr_to_scan)
 break;

+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ break;
+
 cond_resched();
 }

+ /* see the comment in lru_gen_age_node() */
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+ sc->memcgs_need_aging = false;
+done:
 clear_mm_walk();

 blk_finish_plug(&plug);
-- 
2.40.0