target/linux/generic/backport-6.1/020-v6.3-14-UPSTREAM-mm-multi-gen-LRU-section-for-Bloom-filters.patch

   1 From 397624e12244ec038f51cb1f178ccb7a2ec562e5 Mon Sep 17 00:00:00 2001
   2 From: "T.J. Alumbaugh" <talumbau@google.com>
   3 Date: Wed, 18 Jan 2023 00:18:23 +0000
   4 Subject: [PATCH 14/19] UPSTREAM: mm: multi-gen LRU: section for Bloom filters
   5
   6 Move Bloom filters code into a dedicated section.  Improve the design doc
   7 to explain Bloom filter usage and connection between aging and eviction in
   8 their use.
   9
  10 Link: https://lkml.kernel.org/r/20230118001827.1040870-4-talumbau@google.com
  11 Change-Id: I73e866f687c1ed9f5c8538086aa39408b79897db
  12 Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
  13 Cc: Yu Zhao <yuzhao@google.com>
  14 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  15 (cherry picked from commit ccbbbb85945d8f0255aa9dbc1b617017e2294f2c)
  16 Bug: 274865848
  17 Signed-off-by: T.J. Mercier <tjmercier@google.com>
  18 ---
  19  Documentation/mm/multigen_lru.rst |  16 +++
  20  mm/vmscan.c                       | 180 +++++++++++++++---------------
  21  2 files changed, 108 insertions(+), 88 deletions(-)
  22
  23 diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst
  24 index bd988a142bc2f..770b5d539856c 100644
  25 --- a/Documentation/mm/multigen_lru.rst
  26 +++ b/Documentation/mm/multigen_lru.rst
  27 @@ -170,6 +170,22 @@ promotes hot pages. If the scan was done cacheline efficiently, it
  28  adds the PMD entry pointing to the PTE table to the Bloom filter. This
  29  forms a feedback loop between the eviction and the aging.
  30
  31 +Bloom Filters
  32 +-------------
  33 +Bloom filters are a space and memory efficient data structure for set
  34 +membership test, i.e., test if an element is not in the set or may be
  35 +in the set.
  36 +
  37 +In the eviction path, specifically, in ``lru_gen_look_around()``, if a
  38 +PMD has a sufficient number of hot pages, its address is placed in the
  39 +filter. In the aging path, set membership means that the PTE range
  40 +will be scanned for young pages.
  41 +
  42 +Note that Bloom filters are probabilistic on set membership. If a test
  43 +is false positive, the cost is an additional scan of a range of PTEs,
  44 +which may yield hot pages anyway. Parameters of the filter itself can
  45 +control the false positive rate in the limit.
  46 +
  47  Summary
  48  -------
  49  The multi-gen LRU can be disassembled into the following parts:
  50 diff --git a/mm/vmscan.c b/mm/vmscan.c
  51 index 8fa82630240d6..74b4f9d660b56 100644
  52 --- a/mm/vmscan.c
  53 +++ b/mm/vmscan.c
  54 @@ -3208,6 +3208,98 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
  55                get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
  56  }
  57
  58 +/******************************************************************************
  59 + *                          Bloom filters
  60 + ******************************************************************************/
  61 +
  62 +/*
  63 + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
  64 + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
  65 + * bits in a bitmap, k is the number of hash functions and n is the number of
  66 + * inserted items.
  67 + *
  68 + * Page table walkers use one of the two filters to reduce their search space.
  69 + * To get rid of non-leaf entries that no longer have enough leaf entries, the
  70 + * aging uses the double-buffering technique to flip to the other filter each
  71 + * time it produces a new generation. For non-leaf entries that have enough
  72 + * leaf entries, the aging carries them over to the next generation in
  73 + * walk_pmd_range(); the eviction also report them when walking the rmap
  74 + * in lru_gen_look_around().
  75 + *
  76 + * For future optimizations:
  77 + * 1. It's not necessary to keep both filters all the time. The spare one can be
  78 + *    freed after the RCU grace period and reallocated if needed again.
  79 + * 2. And when reallocating, it's worth scaling its size according to the number
  80 + *    of inserted entries in the other filter, to reduce the memory overhead on
  81 + *    small systems and false positives on large systems.
  82 + * 3. Jenkins' hash function is an alternative to Knuth's.
  83 + */
  84 +#define BLOOM_FILTER_SHIFT     15
  85 +
  86 +static inline int filter_gen_from_seq(unsigned long seq)
  87 +{
  88 +       return seq % NR_BLOOM_FILTERS;
  89 +}
  90 +
  91 +static void get_item_key(void *item, int *key)
  92 +{
  93 +       u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
  94 +
  95 +       BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
  96 +
  97 +       key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
  98 +       key[1] = hash >> BLOOM_FILTER_SHIFT;
  99 +}
 100 +
 101 +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
 102 +{
 103 +       int key[2];
 104 +       unsigned long *filter;
 105 +       int gen = filter_gen_from_seq(seq);
 106 +
 107 +       filter = READ_ONCE(lruvec->mm_state.filters[gen]);
 108 +       if (!filter)
 109 +               return true;
 110 +
 111 +       get_item_key(item, key);
 112 +
 113 +       return test_bit(key[0], filter) && test_bit(key[1], filter);
 114 +}
 115 +
 116 +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
 117 +{
 118 +       int key[2];
 119 +       unsigned long *filter;
 120 +       int gen = filter_gen_from_seq(seq);
 121 +
 122 +       filter = READ_ONCE(lruvec->mm_state.filters[gen]);
 123 +       if (!filter)
 124 +               return;
 125 +
 126 +       get_item_key(item, key);
 127 +
 128 +       if (!test_bit(key[0], filter))
 129 +               set_bit(key[0], filter);
 130 +       if (!test_bit(key[1], filter))
 131 +               set_bit(key[1], filter);
 132 +}
 133 +
 134 +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
 135 +{
 136 +       unsigned long *filter;
 137 +       int gen = filter_gen_from_seq(seq);
 138 +
 139 +       filter = lruvec->mm_state.filters[gen];
 140 +       if (filter) {
 141 +               bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
 142 +               return;
 143 +       }
 144 +
 145 +       filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
 146 +                              __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
 147 +       WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
 148 +}
 149 +
 150  /******************************************************************************
 151   *                          mm_struct list
 152   ******************************************************************************/
 153 @@ -3333,94 +3425,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
 154  }
 155  #endif
 156
 157 -/*
 158 - * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
 159 - * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
 160 - * bits in a bitmap, k is the number of hash functions and n is the number of
 161 - * inserted items.
 162 - *
 163 - * Page table walkers use one of the two filters to reduce their search space.
 164 - * To get rid of non-leaf entries that no longer have enough leaf entries, the
 165 - * aging uses the double-buffering technique to flip to the other filter each
 166 - * time it produces a new generation. For non-leaf entries that have enough
 167 - * leaf entries, the aging carries them over to the next generation in
 168 - * walk_pmd_range(); the eviction also report them when walking the rmap
 169 - * in lru_gen_look_around().
 170 - *
 171 - * For future optimizations:
 172 - * 1. It's not necessary to keep both filters all the time. The spare one can be
 173 - *    freed after the RCU grace period and reallocated if needed again.
 174 - * 2. And when reallocating, it's worth scaling its size according to the number
 175 - *    of inserted entries in the other filter, to reduce the memory overhead on
 176 - *    small systems and false positives on large systems.
 177 - * 3. Jenkins' hash function is an alternative to Knuth's.
 178 - */
 179 -#define BLOOM_FILTER_SHIFT     15
 180 -
 181 -static inline int filter_gen_from_seq(unsigned long seq)
 182 -{
 183 -       return seq % NR_BLOOM_FILTERS;
 184 -}
 185 -
 186 -static void get_item_key(void *item, int *key)
 187 -{
 188 -       u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
 189 -
 190 -       BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
 191 -
 192 -       key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
 193 -       key[1] = hash >> BLOOM_FILTER_SHIFT;
 194 -}
 195 -
 196 -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
 197 -{
 198 -       unsigned long *filter;
 199 -       int gen = filter_gen_from_seq(seq);
 200 -
 201 -       filter = lruvec->mm_state.filters[gen];
 202 -       if (filter) {
 203 -               bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
 204 -               return;
 205 -       }
 206 -
 207 -       filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
 208 -                              __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
 209 -       WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
 210 -}
 211 -
 212 -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
 213 -{
 214 -       int key[2];
 215 -       unsigned long *filter;
 216 -       int gen = filter_gen_from_seq(seq);
 217 -
 218 -       filter = READ_ONCE(lruvec->mm_state.filters[gen]);
 219 -       if (!filter)
 220 -               return;
 221 -
 222 -       get_item_key(item, key);
 223 -
 224 -       if (!test_bit(key[0], filter))
 225 -               set_bit(key[0], filter);
 226 -       if (!test_bit(key[1], filter))
 227 -               set_bit(key[1], filter);
 228 -}
 229 -
 230 -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
 231 -{
 232 -       int key[2];
 233 -       unsigned long *filter;
 234 -       int gen = filter_gen_from_seq(seq);
 235 -
 236 -       filter = READ_ONCE(lruvec->mm_state.filters[gen]);
 237 -       if (!filter)
 238 -               return true;
 239 -
 240 -       get_item_key(item, key);
 241 -
 242 -       return test_bit(key[0], filter) && test_bit(key[1], filter);
 243 -}
 244 -
 245  static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
 246  {
 247         int i;
 248 --
 249 2.40.1
 250