target/linux/bcm27xx/patches-6.1/950-0705-drm-v3d-New-debugfs-end-points-to-query-GPU-usage-st.patch

   1 From 2725d86da22e2d2e46d970f0b2f2193a0b8e653b Mon Sep 17 00:00:00 2001
   2 From: Jose Maria Casanova Crespo <jmcasanova@igalia.com>
   3 Date: Tue, 7 Feb 2023 13:54:02 +0100
   4 Subject: [PATCH] drm/v3d: New debugfs end-points to query GPU usage
   5  stats.
   6
   7 Two new debugfs interfaces are implemented:
   8
   9 - gpu_usage: exposes the total runtime since boot of each
  10 of the 5 scheduling queues available at V3D (BIN, RENDER,
  11 CSD, TFU, CACHE_CLEAN). So if the interface is queried at
  12 two different points of time the usage percentage of each
  13 of the queues can be calculated.
  14
  15 - gpu_pid_usage: exposes the same information but at the
  16 level of detail of each process using the V3D driver. The
  17 runtime of each process using the driver is stored, so the
  18 percentages of usage by PID can be calculated from
  19 measurements at different timestamps.
  20
  21 The storage of gpu_pid_usage stats is only done if
  22 the debugfs interface was polled during the last 70 seconds.
  23 If a process does not submit a GPU job during the last 70
  24 seconds, its stats will also be purged.
  25
  26 Signed-off-by: Jose Maria Casanova Crespo <jmcasanova@igalia.com>
  27 ---
  28  drivers/gpu/drm/v3d/v3d_debugfs.c |  79 +++++++++++++++++
  29  drivers/gpu/drm/v3d/v3d_drv.h     |  59 +++++++++++++
  30  drivers/gpu/drm/v3d/v3d_gem.c     |   1 +
  31  drivers/gpu/drm/v3d/v3d_irq.c     |   5 ++
  32  drivers/gpu/drm/v3d/v3d_sched.c   | 139 +++++++++++++++++++++++++++++-
  33  5 files changed, 282 insertions(+), 1 deletion(-)
  34
  35 --- a/drivers/gpu/drm/v3d/v3d_debugfs.c
  36 +++ b/drivers/gpu/drm/v3d/v3d_debugfs.c
  37 @@ -6,6 +6,7 @@
  38  #include <linux/debugfs.h>
  39  #include <linux/seq_file.h>
  40  #include <linux/string_helpers.h>
  41 +#include <linux/sched/clock.h>
  42
  43  #include <drm/drm_debugfs.h>
  44
  45 @@ -202,6 +203,82 @@ static int v3d_debugfs_bo_stats(struct s
  46         return 0;
  47  }
  48
  49 +static int v3d_debugfs_gpu_usage(struct seq_file *m, void *unused)
  50 +{
  51 +       struct drm_info_node *node = (struct drm_info_node *)m->private;
  52 +       struct drm_device *dev = node->minor->dev;
  53 +       struct v3d_dev *v3d = to_v3d_dev(dev);
  54 +       struct v3d_queue_stats *queue_stats;
  55 +       enum v3d_queue queue;
  56 +       u64 timestamp = local_clock(); /* single sample for header and rows */
  57 +       u64 active_runtime;
  58 +
  59 +       seq_printf(m, "timestamp;%llu;\n", timestamp);
  60 +       seq_puts(m, "\"QUEUE\";\"JOBS\";\"RUNTIME\";\"ACTIVE\";\n");
  61 +       for (queue = 0; queue < V3D_MAX_QUEUES; queue++) {
  62 +               if (!v3d->queue[queue].sched.ready)
  63 +                       continue;
  64 +
  65 +               queue_stats = &v3d->gpu_queue_stats[queue];
  66 +               mutex_lock(&queue_stats->lock);
  67 +               v3d_sched_stats_update(queue_stats);
  68 +               /* A job may have started after timestamp was sampled. */
  69 +               active_runtime = 0;
  70 +               if (queue_stats->last_pid && timestamp > queue_stats->last_exec_start)
  71 +                       active_runtime = timestamp - queue_stats->last_exec_start;
  72 +
  73 +               seq_printf(m, "%s;%u;%llu;%c;\n",
  74 +                          v3d_queue_to_string(queue),
  75 +                          queue_stats->jobs_sent,
  76 +                          queue_stats->runtime + active_runtime,
  77 +                          queue_stats->last_pid ? '1' : '0');
  78 +               mutex_unlock(&queue_stats->lock);
  79 +       }
  80 +
  81 +       return 0;
  82 +}
  83 +
  84 +static int v3d_debugfs_gpu_pid_usage(struct seq_file *m, void *unused)
  85 +{
  86 +       struct drm_info_node *node = (struct drm_info_node *)m->private;
  87 +       struct drm_device *dev = node->minor->dev;
  88 +       struct v3d_dev *v3d = to_v3d_dev(dev);
  89 +       struct v3d_queue_stats *queue_stats;
  90 +       struct v3d_queue_pid_stats *cur;
  91 +       enum v3d_queue queue;
  92 +       u64 active_runtime;
  93 +       u64 timestamp = local_clock();
  94 +
  95 +       seq_printf(m, "timestamp;%llu;\n", timestamp);
  96 +       seq_puts(m, "\"QUEUE\";\"PID\";\"JOBS\";\"RUNTIME\";\"ACTIVE\";\n");
  97 +       for (queue = 0; queue < V3D_MAX_QUEUES; queue++) {
  98 +               if (!v3d->queue[queue].sched.ready)
  99 +                       continue;
 100 +
 101 +               /* Reading this file re-arms per-process stats collection. */
 102 +               queue_stats = &v3d->gpu_queue_stats[queue];
 103 +               mutex_lock(&queue_stats->lock);
 104 +               queue_stats->gpu_pid_stats_timeout = jiffies + V3D_QUEUE_STATS_TIMEOUT;
 105 +               v3d_sched_stats_update(queue_stats);
 106 +               list_for_each_entry(cur, &queue_stats->pid_stats_list, list) {
 107 +                       /* A job may have started after timestamp was sampled. */
 108 +                       active_runtime = 0;
 109 +                       if (cur->pid == queue_stats->last_pid &&
 110 +                           timestamp > queue_stats->last_exec_start)
 111 +                               active_runtime = timestamp - queue_stats->last_exec_start;
 112 +
 113 +                       seq_printf(m, "%s;%d;%u;%llu;%c;\n",
 114 +                                  v3d_queue_to_string(queue),
 115 +                                  cur->pid, cur->jobs_sent,
 116 +                                  cur->runtime + active_runtime,
 117 +                                  cur->pid == queue_stats->last_pid ? '1' : '0');
 118 +               }
 119 +               mutex_unlock(&queue_stats->lock);
 120 +       }
 121 +
 122 +       return 0;
 123 +}
 124 +
 125  static int v3d_measure_clock(struct seq_file *m, void *unused)
 126  {
 127         struct drm_info_node *node = (struct drm_info_node *)m->private;
 128 @@ -241,6 +318,8 @@ static const struct drm_info_list v3d_de
 129         {"v3d_regs", v3d_v3d_debugfs_regs, 0},
 130         {"measure_clock", v3d_measure_clock, 0},
 131         {"bo_stats", v3d_debugfs_bo_stats, 0},
 132 +       {"gpu_usage", v3d_debugfs_gpu_usage, 0},
 133 +       {"gpu_pid_usage", v3d_debugfs_gpu_pid_usage, 0},
 134  };
 135
 136  void
 137 --- a/drivers/gpu/drm/v3d/v3d_drv.h
 138 +++ b/drivers/gpu/drm/v3d/v3d_drv.h
 139 @@ -21,6 +21,19 @@ struct reset_control;
 140
 141  #define V3D_MAX_QUEUES (V3D_CACHE_CLEAN + 1)
 142
 143 +static inline char *
 144 +v3d_queue_to_string(enum v3d_queue queue)
 145 +{
 146 +       switch (queue) {
 147 +       case V3D_BIN: return "v3d_bin";
 148 +       case V3D_RENDER: return "v3d_render";
 149 +       case V3D_TFU: return "v3d_tfu";
 150 +       case V3D_CSD: return "v3d_csd";
 151 +       case V3D_CACHE_CLEAN: return "v3d_cache_clean";
 152 +       }
 153 +       return "UNKNOWN"; /* fallback for values not matched by the switch */
 154 +}
 155 +
 156  struct v3d_queue_state {
 157         struct drm_gpu_scheduler sched;
 158
 159 @@ -28,6 +41,44 @@ struct v3d_queue_state {
 160         u64 emit_seqno;
 161  };
 162
 163 +struct v3d_queue_pid_stats {
 164 +       struct  list_head list; /* link in v3d_queue_stats.pid_stats_list */
 165 +       u64     runtime; /* sum of local_clock() deltas of finished jobs */
 166 +       /* Time in jiffies to purge the stats of this process. Every time a
 167 +        * process sends a new job to the queue, this timeout is delayed by
 168 +        * V3D_QUEUE_STATS_TIMEOUT while the gpu_pid_stats_timeout of the
 169 +        * queue is not reached.
 170 +        */
 171 +       unsigned long timeout_purge;
 172 +       u32     jobs_sent; /* jobs counted for this pid on this queue */
 173 +       pid_t   pid; /* client_pid of the jobs accounted here */
 174 +};
 175 +
 176 +struct v3d_queue_stats {
 177 +       struct mutex lock; /* held around v3d_sched_stats_update() */
 178 +       u64     last_exec_start; /* local_clock() when the last job was added */
 179 +       u64     last_exec_end; /* local_clock() at completion; 0 while running */
 180 +       u64     runtime; /* accumulated runtime of finished jobs */
 181 +       u32     jobs_sent; /* jobs added to this queue */
 182 +       /* Time in jiffies to stop collecting gpu stats by process. This is
 183 +        * increased by every access to the debugfs interface gpu_pid_usage.
 184 +        * If the debugfs is not used stats are not collected.
 185 +        */
 186 +       unsigned long gpu_pid_stats_timeout;
 187 +       pid_t   last_pid; /* pid of the last job; 0 once it has been accounted */
 188 +       struct list_head pid_stats_list; /* v3d_queue_pid_stats, latest first */
 189 +};
 190 +
 191 +/* pid_stats by process (v3d_queue_pid_stats) are recorded if the
 192 + * gpu_pid_usage debugfs interface has been accessed within the last
 193 + * V3D_QUEUE_STATS_TIMEOUT (70s).
 194 + *
 195 + * The same timeout is used to purge the per-process stats of processes
 196 + * that have not sent jobs during this period.
 197 + */
 198 +#define V3D_QUEUE_STATS_TIMEOUT (70 * HZ)
 199 +
 200 +
 201  /* Performance monitor object. The perform lifetime is controlled by userspace
 202   * using perfmon related ioctls. A perfmon can be attached to a submit_cl
 203   * request, and when this is the case, HW perf counters will be activated just
 204 @@ -147,6 +198,8 @@ struct v3d_dev {
 205                 u32 num_allocated;
 206                 u32 pages_allocated;
 207         } bo_stats;
 208 +
 209 +       struct v3d_queue_stats gpu_queue_stats[V3D_MAX_QUEUES];
 210  };
 211
 212  static inline struct v3d_dev *
 213 @@ -244,6 +297,11 @@ struct v3d_job {
 214          */
 215         struct v3d_perfmon *perfmon;
 216
 217 +       /* PID of the process that submitted the job, which can be used
 218 +        * for collecting per-process stats of GPU usage.
 219 +        */
 220 +       pid_t client_pid;
 221 +
 222         /* Callback for the freeing of the job on refcount going to 0. */
 223         void (*free)(struct kref *ref);
 224  };
 225 @@ -408,6 +466,7 @@ void v3d_mmu_remove_ptes(struct v3d_bo *
 226  /* v3d_sched.c */
 227  int v3d_sched_init(struct v3d_dev *v3d);
 228  void v3d_sched_fini(struct v3d_dev *v3d);
 229 +void v3d_sched_stats_update(struct v3d_queue_stats *queue_stats);
 230
 231  /* v3d_perfmon.c */
 232  void v3d_perfmon_get(struct v3d_perfmon *perfmon);
 233 --- a/drivers/gpu/drm/v3d/v3d_gem.c
 234 +++ b/drivers/gpu/drm/v3d/v3d_gem.c
 235 @@ -516,6 +516,7 @@ v3d_job_init(struct v3d_dev *v3d, struct
 236         job = *container;
 237         job->v3d = v3d;
 238         job->free = free;
 239 +       job->client_pid = current->pid;
 240
 241         ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
 242                                  v3d_priv);
 243 --- a/drivers/gpu/drm/v3d/v3d_irq.c
 244 +++ b/drivers/gpu/drm/v3d/v3d_irq.c
 245 @@ -14,6 +14,7 @@
 246   */
 247
 248  #include <linux/platform_device.h>
 249 +#include <linux/sched/clock.h>
 250
 251  #include "v3d_drv.h"
 252  #include "v3d_regs.h"
 253 @@ -100,6 +101,7 @@ v3d_irq(int irq, void *arg)
 254         if (intsts & V3D_INT_FLDONE) {
 255                 struct v3d_fence *fence =
 256                         to_v3d_fence(v3d->bin_job->base.irq_fence);
 257 +               v3d->gpu_queue_stats[V3D_BIN].last_exec_end = local_clock();
 258
 259                 trace_v3d_bcl_irq(&v3d->drm, fence->seqno);
 260                 dma_fence_signal(&fence->base);
 261 @@ -109,6 +111,7 @@ v3d_irq(int irq, void *arg)
 262         if (intsts & V3D_INT_FRDONE) {
 263                 struct v3d_fence *fence =
 264                         to_v3d_fence(v3d->render_job->base.irq_fence);
 265 +               v3d->gpu_queue_stats[V3D_RENDER].last_exec_end = local_clock();
 266
 267                 trace_v3d_rcl_irq(&v3d->drm, fence->seqno);
 268                 dma_fence_signal(&fence->base);
 269 @@ -118,6 +121,7 @@ v3d_irq(int irq, void *arg)
 270         if (intsts & V3D_INT_CSDDONE) {
 271                 struct v3d_fence *fence =
 272                         to_v3d_fence(v3d->csd_job->base.irq_fence);
 273 +               v3d->gpu_queue_stats[V3D_CSD].last_exec_end = local_clock();
 274
 275                 trace_v3d_csd_irq(&v3d->drm, fence->seqno);
 276                 dma_fence_signal(&fence->base);
 277 @@ -154,6 +158,7 @@ v3d_hub_irq(int irq, void *arg)
 278         if (intsts & V3D_HUB_INT_TFUC) {
 279                 struct v3d_fence *fence =
 280                         to_v3d_fence(v3d->tfu_job->base.irq_fence);
 281 +               v3d->gpu_queue_stats[V3D_TFU].last_exec_end = local_clock();
 282
 283                 trace_v3d_tfu_irq(&v3d->drm, fence->seqno);
 284                 dma_fence_signal(&fence->base);
 285 --- a/drivers/gpu/drm/v3d/v3d_sched.c
 286 +++ b/drivers/gpu/drm/v3d/v3d_sched.c
 287 @@ -19,6 +19,7 @@
 288   */
 289
 290  #include <linux/kthread.h>
 291 +#include <linux/sched/clock.h>
 292
 293  #include "v3d_drv.h"
 294  #include "v3d_regs.h"
 295 @@ -72,6 +73,114 @@ v3d_switch_perfmon(struct v3d_dev *v3d,
 296                 v3d_perfmon_start(v3d, job->perfmon);
 297  }
 298
 299 +/*
 300 + * Accounts the runtime of the last completed job and purges stale pid stats.
 301 + *
 302 + * It should be called before any new job submission to the queue or before
 303 + * accessing the stats from the debugfs interface.
 304 + *
 305 + * It is expected that calls to this function are done with queue_stats->lock
 306 + * locked.
 307 + */
 308 +void
 309 +v3d_sched_stats_update(struct v3d_queue_stats *queue_stats)
 310 +{
 311 +       struct list_head *pid_stats_list = &queue_stats->pid_stats_list;
 312 +       struct v3d_queue_pid_stats *cur, *tmp;
 313 +       u64 runtime = 0;
 314 +       bool store_pid_stats =
 315 +               time_is_after_jiffies(queue_stats->gpu_pid_stats_timeout);
 316 +
 317 +       /* If debugfs stats gpu_pid_usage has not been polled for a period,
 318 +        * the pid stats collection is stopped and we purge any existing
 319 +        * pid_stats.
 320 +        *
 321 +        * pid_stats are also purged for clients that have reached the
 322 +        * timeout_purge because the process probably does not exist anymore.
 323 +        */
 324 +       list_for_each_entry_safe_reverse(cur, tmp, pid_stats_list, list) {
 325 +               if (!store_pid_stats || time_is_before_jiffies(cur->timeout_purge)) {
 326 +                       list_del(&cur->list);
 327 +                       kfree(cur);
 328 +               } else {
 329 +                       break; /* walking from the tail; stop at first live entry */
 330 +               }
 331 +       }
 332 +       /* If a job has finished, its stats are updated. */
 333 +       if (queue_stats->last_pid && queue_stats->last_exec_end) {
 334 +               runtime = queue_stats->last_exec_end -
 335 +                         queue_stats->last_exec_start;
 336 +               queue_stats->runtime += runtime;
 337 +
 338 +               if (store_pid_stats) {
 339 +                       struct v3d_queue_pid_stats *pid_stats;
 340 +                       /* Last job info is always at the head of the list */
 341 +                       pid_stats = list_first_entry_or_null(pid_stats_list,
 342 +                               struct v3d_queue_pid_stats, list);
 343 +                       if (pid_stats &&
 344 +                           pid_stats->pid == queue_stats->last_pid) {
 345 +                               pid_stats->runtime += runtime;
 346 +                       }
 347 +               }
 348 +               queue_stats->last_pid = 0; /* job accounted; not counted again */
 349 +       }
 350 +}
 351 +
 352 +/*
 353 + * Updates the queue usage adding the information of a new job that is
 354 + * about to be sent to the GPU to be executed. Returns 0, or -ENOMEM if the
 355 + * per-process entry for the submitter could not be allocated.
 356 + */
 357 +int
 358 +v3d_sched_stats_add_job(struct v3d_queue_stats *queue_stats,
 359 +                       struct drm_sched_job *sched_job)
 360 +{
 361 +       struct v3d_queue_pid_stats *pid_stats = NULL;
 362 +       struct v3d_job *job = to_v3d_job(sched_job); /* must not be NULL */
 363 +       struct v3d_queue_pid_stats *cur;
 364 +       struct list_head *pid_stats_list = &queue_stats->pid_stats_list;
 365 +       int ret = 0;
 366 +
 367 +       mutex_lock(&queue_stats->lock);
 368 +
 369 +       /* Completion of previous job requires an update of its runtime stats */
 370 +       v3d_sched_stats_update(queue_stats);
 371 +
 372 +       queue_stats->last_exec_start = local_clock();
 373 +       queue_stats->last_exec_end = 0;
 374 +       queue_stats->jobs_sent++;
 375 +       queue_stats->last_pid = job->client_pid;
 376 +
 377 +       /* gpu usage stats by process are being collected */
 378 +       if (time_is_after_jiffies(queue_stats->gpu_pid_stats_timeout)) {
 379 +               list_for_each_entry(cur, pid_stats_list, list) {
 380 +                       if (cur->pid == job->client_pid) {
 381 +                               pid_stats = cur;
 382 +                               break;
 383 +                       }
 384 +               }
 385 +               /* pid_stats of this client is moved to the head of the list. */
 386 +               if (pid_stats) {
 387 +                       list_move(&pid_stats->list, pid_stats_list);
 388 +               } else {
 389 +                       /* NOTE(review): reached from run_job; confirm GFP_KERNEL is OK. */
 390 +                       pid_stats = kzalloc(sizeof(*pid_stats), GFP_KERNEL);
 391 +                       if (!pid_stats) {
 392 +                               ret = -ENOMEM;
 393 +                               goto err_mem;
 394 +                       }
 395 +                       pid_stats->pid = job->client_pid;
 396 +                       list_add(&pid_stats->list, pid_stats_list);
 397 +               }
 398 +               pid_stats->jobs_sent++;
 399 +               pid_stats->timeout_purge = jiffies + V3D_QUEUE_STATS_TIMEOUT;
 400 +       }
 401 +
 402 +err_mem:
 403 +       mutex_unlock(&queue_stats->lock);
 404 +       return ret;
 405 +}
 406 +
 407  static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
 408  {
 409         struct v3d_bin_job *job = to_bin_job(sched_job);
 410 @@ -107,6 +216,7 @@ static struct dma_fence *v3d_bin_job_run
 411         trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
 412                             job->start, job->end);
 413
 414 +       v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_BIN], sched_job);
 415         v3d_switch_perfmon(v3d, &job->base);
 416
 417         /* Set the current and end address of the control list.
 418 @@ -158,6 +268,7 @@ static struct dma_fence *v3d_render_job_
 419         trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
 420                             job->start, job->end);
 421
 422 +       v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_RENDER], sched_job);
 423         v3d_switch_perfmon(v3d, &job->base);
 424
 425         /* XXX: Set the QCFG */
 426 @@ -190,6 +301,7 @@ v3d_tfu_job_run(struct drm_sched_job *sc
 427
 428         trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
 429
 430 +       v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_TFU], sched_job);
 431         V3D_WRITE(V3D_TFU_IIA, job->args.iia);
 432         V3D_WRITE(V3D_TFU_IIS, job->args.iis);
 433         V3D_WRITE(V3D_TFU_ICA, job->args.ica);
 434 @@ -231,6 +343,7 @@ v3d_csd_job_run(struct drm_sched_job *sc
 435
 436         trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
 437
 438 +       v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_CSD], sched_job);
 439         v3d_switch_perfmon(v3d, &job->base);
 440
 441         for (i = 1; i <= 6; i++)
 442 @@ -247,7 +360,10 @@ v3d_cache_clean_job_run(struct drm_sched
 443         struct v3d_job *job = to_v3d_job(sched_job);
 444         struct v3d_dev *v3d = job->v3d;
 445
 446 +       v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_CACHE_CLEAN],
 447 +                               sched_job);
 448         v3d_clean_caches(v3d);
 449 +       v3d->gpu_queue_stats[V3D_CACHE_CLEAN].last_exec_end = local_clock();
 450
 451         return NULL;
 452  }
 453 @@ -385,8 +501,18 @@ v3d_sched_init(struct v3d_dev *v3d)
 454         int hw_jobs_limit = 1;
 455         int job_hang_limit = 0;
 456         int hang_limit_ms = 500;
 457 +       enum v3d_queue q;
 458         int ret;
 459
 460 +       for (q = 0; q < V3D_MAX_QUEUES; q++) {
 461 +               INIT_LIST_HEAD(&v3d->gpu_queue_stats[q].pid_stats_list);
 462 +               /* Setting timeout before current jiffies disables collecting
 463 +                * pid_stats on scheduling init.
 464 +                */
 465 +               v3d->gpu_queue_stats[q].gpu_pid_stats_timeout = jiffies - 1;
 466 +               mutex_init(&v3d->gpu_queue_stats[q].lock);
 467 +       }
 468 +
 469         ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
 470                              &v3d_bin_sched_ops,
 471                              hw_jobs_limit, job_hang_limit,
 472 @@ -440,9 +566,20 @@ void
 473  v3d_sched_fini(struct v3d_dev *v3d)
 474  {
 475         enum v3d_queue q;
 476 +       struct v3d_queue_stats *queue_stats;
 477
 478         for (q = 0; q < V3D_MAX_QUEUES; q++) {
 479 -               if (v3d->queue[q].sched.ready)
 480 +               if (v3d->queue[q].sched.ready) {
 481 +                       queue_stats = &v3d->gpu_queue_stats[q];
 482 +                       mutex_lock(&queue_stats->lock);
 483 +                       /* Setting gpu_pid_stats_timeout to jiffies-1 will
 484 +                        * make v3d_sched_stats_update to purge all
 485 +                        * allocated pid_stats.
 486 +                        */
 487 +                       queue_stats->gpu_pid_stats_timeout = jiffies - 1;
 488 +                       v3d_sched_stats_update(queue_stats);
 489 +                       mutex_unlock(&queue_stats->lock);
 490                         drm_sched_fini(&v3d->queue[q].sched);
 491 +               }
 492         }
 493  }