Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next

[openwrt/staging/blogic.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 52c82b2c94dcc362fccdc4d477ccc704f5cd84c0..35f4cc024dcfcb17e183b82d44359e430b791f52 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -191,7 +191,7 @@ static void update_sysctl(void)
  #undef SET_SYSCTL
  }
  
-void sched_init_granularity(void)
+void __init sched_init_granularity(void)
  {
         update_sysctl();
  }
@@ -1093,7 +1093,7 @@ struct numa_group {
          * more by CPU use than by memory faults.
          */
         unsigned long *faults_cpu;
-       unsigned long faults[0];
+       unsigned long faults[];
  };
  
  /*
@@ -3440,52 +3440,46 @@ static inline void
  update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
  {
         long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
+       u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
  
         /* Nothing to update */
         if (!delta)
                 return;
  
-       /*
-        * The relation between sum and avg is:
-        *
-        *   LOAD_AVG_MAX - 1024 + sa->period_contrib
-        *
-        * however, the PELT windows are not aligned between grq and gse.
-        */
-
         /* Set new sched_entity's utilization */
         se->avg.util_avg = gcfs_rq->avg.util_avg;
-       se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+       se->avg.util_sum = se->avg.util_avg * divider;
  
         /* Update parent cfs_rq utilization */
         add_positive(&cfs_rq->avg.util_avg, delta);
-       cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+       cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
  }
  
  static inline void
  update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
  {
         long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
+       u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
  
         /* Nothing to update */
         if (!delta)
                 return;
  
-       /*
-        * The relation between sum and avg is:
-        *
-        *   LOAD_AVG_MAX - 1024 + sa->period_contrib
-        *
-        * however, the PELT windows are not aligned between grq and gse.
-        */
-
         /* Set new sched_entity's runnable */
         se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
-       se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
+       se->avg.runnable_sum = se->avg.runnable_avg * divider;
  
         /* Update parent cfs_rq runnable */
         add_positive(&cfs_rq->avg.runnable_avg, delta);
-       cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
+       cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
  }
  
  static inline void
@@ -3495,19 +3489,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
         unsigned long load_avg;
         u64 load_sum = 0;
         s64 delta_sum;
+       u32 divider;
  
         if (!runnable_sum)
                 return;
  
         gcfs_rq->prop_runnable_sum = 0;
  
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
+       divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
         if (runnable_sum >= 0) {
                 /*
                  * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
                  * the CPU is saturated running == runnable.
                  */
                 runnable_sum += se->avg.load_sum;
-               runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+               runnable_sum = min_t(long, runnable_sum, divider);
         } else {
                 /*
                  * Estimate the new unweighted runnable_sum of the gcfs_rq by
@@ -3532,7 +3533,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
         runnable_sum = max(runnable_sum, running_sum);
  
         load_sum = (s64)se_weight(se) * runnable_sum;
-       load_avg = div_s64(load_sum, LOAD_AVG_MAX);
+       load_avg = div_s64(load_sum, divider);
  
         delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
         delta_avg = load_avg - se->avg.load_avg;
@@ -3696,6 +3697,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
   */
  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
+       /*
+        * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+        * See ___update_load_avg() for details.
+        */
         u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
  
         /*
@@ -3872,6 +3877,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
         return cfs_rq->avg.load_avg;
  }
  
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
  static inline unsigned long task_util(struct task_struct *p)
  {
         return READ_ONCE(p->se.avg.util_avg);
@@ -4053,7 +4060,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  static inline void
  detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
  {
         return 0;
  }
@@ -4587,16 +4594,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
  }
  
  /* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+                                  struct cfs_rq *cfs_rq, u64 target_runtime)
  {
-       struct task_group *tg = cfs_rq->tg;
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       u64 amount = 0, min_amount;
+       u64 min_amount, amount = 0;
+
+       lockdep_assert_held(&cfs_b->lock);
  
         /* note: this is a positive sum as runtime_remaining <= 0 */
-       min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+       min_amount = target_runtime - cfs_rq->runtime_remaining;
  
-       raw_spin_lock(&cfs_b->lock);
         if (cfs_b->quota == RUNTIME_INF)
                 amount = min_amount;
         else {
@@ -4608,13 +4615,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                         cfs_b->idle = 0;
                 }
         }
-       raw_spin_unlock(&cfs_b->lock);
  
         cfs_rq->runtime_remaining += amount;
  
         return cfs_rq->runtime_remaining > 0;
  }
  
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+       int ret;
+
+       raw_spin_lock(&cfs_b->lock);
+       ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+       raw_spin_unlock(&cfs_b->lock);
+
+       return ret;
+}
+
  static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
  {
         /* dock delta_exec before expiring quota (as it could span periods) */
@@ -4703,13 +4722,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
         return 0;
  }
  
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
  {
         struct rq *rq = rq_of(cfs_rq);
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         long task_delta, idle_task_delta, dequeue = 1;
-       bool empty;
+
+       raw_spin_lock(&cfs_b->lock);
+       /* This will start the period timer if necessary */
+       if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+               /*
+                * We have raced with bandwidth becoming available, and if we
+                * actually throttled, the timer might not unthrottle us for an
+                * entire period. We additionally need to make sure that any
+                * subsequent check_cfs_rq_runtime calls agree not to throttle
+                * us, as we may commit to do cfs put_prev+pick_next, so we ask
+                * for 1ns of runtime rather than just checking cfs_b.
+                */
+               dequeue = 0;
+       } else {
+               list_add_tail_rcu(&cfs_rq->throttled_list,
+                                 &cfs_b->throttled_cfs_rq);
+       }
+       raw_spin_unlock(&cfs_b->lock);
+
+       if (!dequeue)
+               return false;  /* Throttle no longer required. */
  
         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
  
@@ -4743,29 +4782,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
         if (!se)
                 sub_nr_running(rq, task_delta);
  
-       cfs_rq->throttled = 1;
-       cfs_rq->throttled_clock = rq_clock(rq);
-       raw_spin_lock(&cfs_b->lock);
-       empty = list_empty(&cfs_b->throttled_cfs_rq);
-
         /*
-        * Add to the _head_ of the list, so that an already-started
-        * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
-        * not running add to the tail so that later runqueues don't get starved.
+        * Note: distribution will already see us throttled via the
+        * throttled-list.  rq->lock protects completion.
          */
-       if (cfs_b->distribute_running)
-               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-       else
-               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
-       /*
-        * If we're the first throttled task, make sure the bandwidth
-        * timer is running.
-        */
-       if (empty)
-               start_cfs_bandwidth(cfs_b);
-
-       raw_spin_unlock(&cfs_b->lock);
+       cfs_rq->throttled = 1;
+       cfs_rq->throttled_clock = rq_clock(rq);
+       return true;
  }
  
  void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4932,14 +4955,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
         /*
          * This check is repeated as we release cfs_b->lock while we unthrottle.
          */
-       while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
-               cfs_b->distribute_running = 1;
+       while (throttled && cfs_b->runtime > 0) {
                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
                 distribute_cfs_runtime(cfs_b);
                 raw_spin_lock_irqsave(&cfs_b->lock, flags);
  
-               cfs_b->distribute_running = 0;
                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
         }
  
@@ -5053,10 +5074,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
         /* confirm we're still not at a refresh boundary */
         raw_spin_lock_irqsave(&cfs_b->lock, flags);
         cfs_b->slack_started = false;
-       if (cfs_b->distribute_running) {
-               raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
-               return;
-       }
  
         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
                 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
@@ -5066,9 +5083,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                 runtime = cfs_b->runtime;
  
-       if (runtime)
-               cfs_b->distribute_running = 1;
-
         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
  
         if (!runtime)
@@ -5077,7 +5091,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
         distribute_cfs_runtime(cfs_b);
  
         raw_spin_lock_irqsave(&cfs_b->lock, flags);
-       cfs_b->distribute_running = 0;
         raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
  }
  
@@ -5138,8 +5151,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         if (cfs_rq_throttled(cfs_rq))
                 return true;
  
-       throttle_cfs_rq(cfs_rq);
-       return true;
+       return throttle_cfs_rq(cfs_rq);
  }
  
  static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -5169,6 +5181,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
                 if (!overrun)
                         break;
  
+               idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
                 if (++count > 3) {
                         u64 new, old = ktime_to_ns(cfs_b->period);
  
@@ -5198,8 +5212,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
                         /* reset count so we don't come right back in here */
                         count = 0;
                 }
-
-               idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
         }
         if (idle)
                 cfs_b->period_active = 0;
@@ -5220,7 +5232,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         cfs_b->period_timer.function = sched_cfs_period_timer;
         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         cfs_b->slack_timer.function = sched_cfs_slack_timer;
-       cfs_b->distribute_running = 0;
         cfs_b->slack_started = false;
  }
  
@@ -5505,28 +5516,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                         list_add_leaf_cfs_rq(cfs_rq);
         }
  
-enqueue_throttle:
-       if (!se) {
-               add_nr_running(rq, 1);
-               /*
-                * Since new tasks are assigned an initial util_avg equal to
-                * half of the spare capacity of their CPU, tiny tasks have the
-                * ability to cross the overutilized threshold, which will
-                * result in the load balancer ruining all the task placement
-                * done by EAS. As a way to mitigate that effect, do not account
-                * for the first enqueue operation of new tasks during the
-                * overutilized flag detection.
-                *
-                * A better way of solving this problem would be to wait for
-                * the PELT signals of tasks to converge before taking them
-                * into account, but that is not straightforward to implement,
-                * and the following generally works well enough in practice.
-                */
-               if (flags & ENQUEUE_WAKEUP)
-                       update_overutilized_status(rq);
+       /* At this point se is NULL and we are at root level. */
+       add_nr_running(rq, 1);
  
-       }
+       /*
+        * Since new tasks are assigned an initial util_avg equal to
+        * half of the spare capacity of their CPU, tiny tasks have the
+        * ability to cross the overutilized threshold, which will
+        * result in the load balancer ruining all the task placement
+        * done by EAS. As a way to mitigate that effect, do not account
+        * for the first enqueue operation of new tasks during the
+        * overutilized flag detection.
+        *
+        * A better way of solving this problem would be to wait for
+        * the PELT signals of tasks to converge before taking them
+        * into account, but that is not straightforward to implement,
+        * and the following generally works well enough in practice.
+        */
+       if (flags & ENQUEUE_WAKEUP)
+               update_overutilized_status(rq);
  
+enqueue_throttle:
         if (cfs_bandwidth_used()) {
                 /*
                  * When bandwidth control is enabled; the cfs_rq_throttled()
@@ -5736,7 +5746,7 @@ static int wake_wide(struct task_struct *p)
  {
         unsigned int master = current->wakee_flips;
         unsigned int slave = p->wakee_flips;
-       int factor = this_cpu_read(sd_llc_size);
+       int factor = __this_cpu_read(sd_llc_size);
  
         if (master < slave)
                 swap(master, slave);
@@ -5845,8 +5855,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
  }
  
  static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int sd_flag);
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
  
  /*
   * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5929,7 +5938,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
                         continue;
                 }
  
-               group = find_idlest_group(sd, p, cpu, sd_flag);
+               group = find_idlest_group(sd, p, cpu);
                 if (!group) {
                         sd = sd->child;
                         continue;
@@ -6670,9 +6679,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  
         rcu_read_lock();
         for_each_domain(cpu, tmp) {
-               if (!(tmp->flags & SD_LOAD_BALANCE))
-                       break;
-
                 /*
                  * If both 'cpu' and 'prev_cpu' are part of this domain,
                  * cpu is a valid SD_WAKE_AFFINE target.
@@ -8583,7 +8589,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
          */
  
  #ifdef CONFIG_SMP
-       if (!llist_empty(&rq->wake_list))
+       if (rq->ttwu_pending)
                 return 0;
  #endif
  
@@ -8701,8 +8707,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
   * Assumes p is allowed on at least one CPU in sd.
   */
  static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int sd_flag)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
  {
         struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
         struct sg_lb_stats local_sgs, tmp_sgs;
@@ -9433,7 +9438,7 @@ static int active_load_balance_cpu_stop(void *data);
  static int should_we_balance(struct lb_env *env)
  {
         struct sched_group *sg = env->sd->groups;
-       int cpu, balance_cpu = -1;
+       int cpu;
  
         /*
          * Ensure the balancing environment is consistent; can happen
@@ -9454,18 +9459,12 @@ static int should_we_balance(struct lb_env *env)
                 if (!idle_cpu(cpu))
                         continue;
  
-               balance_cpu = cpu;
-               break;
+               /* Are we the first idle CPU? */
+               return cpu == env->dst_cpu;
         }
  
-       if (balance_cpu == -1)
-               balance_cpu = group_balance_cpu(sg);
-
-       /*
-        * First idle CPU or the first CPU(busiest) in this sched group
-        * is eligible for doing load balancing at this and above domains.
-        */
-       return balance_cpu == env->dst_cpu;
+       /* Are we the first CPU of this group? */
+       return group_balance_cpu(sg) == env->dst_cpu;
  }
  
  /*
@@ -9818,9 +9817,8 @@ static int active_load_balance_cpu_stop(void *data)
         /* Search for an sd spanning us and the target CPU. */
         rcu_read_lock();
         for_each_domain(target_cpu, sd) {
-               if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
-                               break;
+               if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+                       break;
         }
  
         if (likely(sd)) {
@@ -9909,9 +9907,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                 }
                 max_cost += sd->max_newidle_lb_cost;
  
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       continue;
-
                 /*
                  * Stop the load balance at this level. There is another
                  * CPU in our sched group which is doing load balancing more
@@ -10028,17 +10023,20 @@ static void kick_ilb(unsigned int flags)
         if (ilb_cpu >= nr_cpu_ids)
                 return;
  
+       /*
+        * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
+        * the first flag owns it; cleared by nohz_csd_func().
+        */
         flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
         if (flags & NOHZ_KICK_MASK)
                 return;
  
         /*
-        * Use smp_send_reschedule() instead of resched_cpu().
-        * This way we generate a sched IPI on the target CPU which
+        * Generate an IPI on the target CPU, which
          * is idle. And the softirq performing nohz idle load balance
          * will be run before returning from the IPI.
          */
-       smp_send_reschedule(ilb_cpu);
+       smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
  }
  
  /*
@@ -10376,20 +10374,14 @@ abort:
   */
  static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
  {
-       int this_cpu = this_rq->cpu;
-       unsigned int flags;
+       unsigned int flags = this_rq->nohz_idle_balance;
  
-       if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+       if (!flags)
                 return false;
  
-       if (idle != CPU_IDLE) {
-               atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-               return false;
-       }
+       this_rq->nohz_idle_balance = 0;
  
-       /* could be _relaxed() */
-       flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-       if (!(flags & NOHZ_KICK_MASK))
+       if (idle != CPU_IDLE)
                 return false;
  
         _nohz_idle_balance(this_rq, flags, idle);
@@ -10449,7 +10441,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
   *     0 - failed, no new tasks
   *   > 0 - success, new (fair) tasks present
   */
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
  {
         unsigned long next_balance = jiffies + HZ;
         int this_cpu = this_rq->cpu;
@@ -10500,9 +10492,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
                 int continue_balancing = 1;
                 u64 t0, domain_cost;
  
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       continue;
-
                 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
                         update_next_balance(sd, &next_balance);
                         break;